From 652a36259aaba26cfdefa9f71311632f87c13795 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:54:01 -0700 Subject: [PATCH 1/6] Create tz_localize and tz_convert on the column level --- python/cudf/cudf/core/_internals/timezones.py | 168 +---------------- python/cudf/cudf/core/column/datetime.py | 169 +++++++++++++++++- python/cudf/cudf/core/index.py | 23 +-- python/cudf/cudf/core/series.py | 27 ++- python/cudf/cudf/core/tools/datetimes.py | 5 +- .../cudf/tests/series/test_datetimelike.py | 5 + 6 files changed, 198 insertions(+), 199 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 4888cdd9ac9..13f1a6c8ca2 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -3,23 +3,16 @@ import os import zoneinfo from functools import lru_cache -from typing import Tuple, cast import numpy as np -import pandas as pd -import cudf -from cudf._lib.labeling import label_bins -from cudf._lib.search import search_sorted from cudf._lib.timezone import make_timezone_transition_table -from cudf.core.column.column import as_column, build_column -from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn +from cudf.core.column.column import as_column from cudf.core.dataframe import DataFrame -from cudf.utils.dtypes import _get_base_dtype @lru_cache(maxsize=20) -def get_tz_data(zone_name): +def get_tz_data(zone_name) -> DataFrame: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. @@ -43,14 +36,14 @@ def get_tz_data(zone_name): return tz_table -def _find_and_read_tzfile_tzpath(zone_name): +def _find_and_read_tzfile_tzpath(zone_name) -> DataFrame: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_frame(search_path, zone_name) raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _find_and_read_tzfile_tzdata(zone_name): +def _find_and_read_tzfile_tzdata(zone_name) -> DataFrame: import importlib.resources package_base = "tzdata.zoneinfo" @@ -77,7 +70,7 @@ def _find_and_read_tzfile_tzdata(zone_name): raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _read_tzfile_as_frame(tzdir, zone_name): +def _read_tzfile_as_frame(tzdir, zone_name) -> DataFrame: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -97,79 +90,7 @@ def _read_tzfile_as_frame(tzdir, zone_name): ) -def _find_ambiguous_and_nonexistent( - data: DatetimeColumn, zone_name: str -) -> Tuple: - """ - Recognize ambiguous and nonexistent timestamps for the given timezone. - - Returns a tuple of columns, both of "bool" dtype and of the same - size as `data`, that respectively indicate ambiguous and - nonexistent timestamps in `data` with the value `True`. - - Ambiguous and/or nonexistent timestamps are only possible if any - transitions occur in the time zone database for the given timezone. - If no transitions occur, the tuple `(False, False)` is returned. - """ - tz_data_for_zone = get_tz_data(zone_name) - transition_times = tz_data_for_zone["transition_times"] - offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{data.time_unit}]" - ) - - if len(offsets) == 1: # no transitions - return False, False - - transition_times, offsets, old_offsets = ( - transition_times[1:]._column, - offsets[1:]._column, - offsets[:-1]._column, - ) - - # Assume we have two clocks at the moment of transition: - # - Clock 1 is turned forward or backwards correctly - # - Clock 2 makes no changes - clock_1 = transition_times + offsets - clock_2 = transition_times + old_offsets - - # At the start of an ambiguous time period, Clock 1 (which has - # been turned back) reads less than Clock 2: - cond = clock_1 < clock_2 - ambiguous_begin = clock_1.apply_boolean_mask(cond) - - # The end of an ambiguous time period is what Clock 2 reads at - # the moment of transition: - ambiguous_end = clock_2.apply_boolean_mask(cond) - ambiguous = label_bins( - data, - left_edges=ambiguous_begin, - left_inclusive=True, - right_edges=ambiguous_end, - right_inclusive=False, - ).notnull() - - # At the start of a non-existent time period, Clock 2 reads less - # than Clock 1 (which has been turned forward): - cond = clock_1 > clock_2 - nonexistent_begin = clock_2.apply_boolean_mask(cond) - - # The end of the non-existent time period is what Clock 1 reads - # at the moment of transition: - nonexistent_end = clock_1.apply_boolean_mask(cond) - nonexistent = label_bins( - data, - left_edges=nonexistent_begin, - left_inclusive=True, - right_edges=nonexistent_end, - right_inclusive=False, - ).notnull() - - return ambiguous, nonexistent - - -def localize( - data: DatetimeColumn, zone_name: str, ambiguous, nonexistent -) -> DatetimeTZColumn: +def check_ambiguous_and_nonexistent(ambiguous, nonexistent) -> None: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" @@ -178,80 +99,3 @@ def localize( raise NotImplementedError( "Only nonexistent='NaT' is currently supported" ) - if isinstance(data, DatetimeTZColumn): - raise ValueError( - "Already localized. " - "Use `tz_convert` to convert between time zones." - ) - dtype = pd.DatetimeTZDtype(data.time_unit, zone_name) - ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) - localized = cast( - DatetimeColumn, - data._scatter_by_column( - data.isnull() | (ambiguous | nonexistent), - cudf.Scalar(cudf.NaT, dtype=data.dtype), - ), - ) - gmt_data = local_to_utc(localized, zone_name) - return cast( - DatetimeTZColumn, - build_column( - data=gmt_data.base_data, - dtype=dtype, - mask=localized.base_mask, - size=gmt_data.size, - offset=gmt_data.offset, - ), - ) - - -def delocalize(data: DatetimeColumn) -> DatetimeColumn: - """ - Convert a timezone-aware datetime column to a timezone-naive one. - If the column is already timezone-naive, return it as is. - """ - if isinstance(data, DatetimeTZColumn): - return data._local_time - # already timezone-naive: - return data - - -def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn: - if not isinstance(data, DatetimeTZColumn): - raise TypeError( - "Cannot convert from timezone-naive timestamps to " - "timezone-aware timestamps. For that, " - "use `tz_localize`." - ) - if zone_name == str(data.dtype.tz): - return data.copy() - utc_time = data._utc_time - out = cast( - DatetimeTZColumn, - build_column( - data=utc_time.base_data, - dtype=pd.DatetimeTZDtype(data.time_unit, zone_name), - mask=utc_time.base_mask, - size=utc_time.size, - offset=utc_time.offset, - ), - ) - return out - - -def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn: - tz_data_for_zone = get_tz_data(zone_name) - transition_times, offsets = tz_data_for_zone._columns - transition_times = transition_times.astype(_get_base_dtype(data.dtype)) - indices = search_sorted([transition_times], [data], "right") - 1 - offsets_from_utc = offsets.take(indices, nullify=True) - return data + offsets_from_utc - - -def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn: - tz_data_for_zone = get_tz_data(zone_name) - transition_times, offsets = tz_data_for_zone._columns - transition_times_local = (transition_times + offsets).astype(data.dtype) - indices = search_sorted([transition_times_local], [data], "right") - 1 - offsets_to_utc = offsets.take(indices, nullify=True) - return data - offsets_to_utc diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b84c1dc7ccd..0fd46e8b7d5 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -7,7 +7,16 @@ import locale import re from locale import nl_langinfo -from typing import Any, Mapping, Optional, Sequence, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + Mapping, + Optional, + Sequence, + Tuple, + cast, +) import numpy as np import pandas as pd @@ -16,6 +25,8 @@ import cudf from cudf import _lib as libcudf +from cudf._lib.labeling import label_bins +from cudf._lib.search import search_sorted from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, @@ -31,6 +42,9 @@ from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import _all_bools_with_nulls +if TYPE_CHECKING: + from cudf.core.column.numerical import NumericalColumn + if PANDAS_GE_220: _guess_datetime_format = pd.tseries.api.guess_datetime_format else: @@ -688,6 +702,123 @@ def _with_type_metadata(self, dtype): ) return self + def _find_ambiguous_and_nonexistent( + self, zone_name: str + ) -> Tuple[NumericalColumn | bool, NumericalColumn | bool]: + """ + Recognize ambiguous and nonexistent timestamps for the given timezone. + + Returns a tuple of columns, both of "bool" dtype and of the same + size as `data`, that respectively indicate ambiguous and + nonexistent timestamps in `data` with the value `True`. + + Ambiguous and/or nonexistent timestamps are only possible if any + transitions occur in the time zone database for the given timezone. + If no transitions occur, the tuple `(False, False)` is returned. + """ + from cudf.core._internals.timezones import get_tz_data + + tz_data_for_zone = get_tz_data(zone_name) + transition_times = tz_data_for_zone["transition_times"] + offsets = tz_data_for_zone["offsets"].astype( + f"timedelta64[{self.time_unit}]" + ) + + if len(offsets) == 1: # no transitions + return False, False + + transition_times, offsets, old_offsets = ( + transition_times[1:]._column, + offsets[1:]._column, + offsets[:-1]._column, + ) + + # Assume we have two clocks at the moment of transition: + # - Clock 1 is turned forward or backwards correctly + # - Clock 2 makes no changes + clock_1 = transition_times + offsets + clock_2 = transition_times + old_offsets + + # At the start of an ambiguous time period, Clock 1 (which has + # been turned back) reads less than Clock 2: + cond = clock_1 < clock_2 + ambiguous_begin = clock_1.apply_boolean_mask(cond) + + # The end of an ambiguous time period is what Clock 2 reads at + # the moment of transition: + ambiguous_end = clock_2.apply_boolean_mask(cond) + ambiguous = label_bins( + self, + left_edges=ambiguous_begin, + left_inclusive=True, + right_edges=ambiguous_end, + right_inclusive=False, + ).notnull() + + # At the start of a non-existent time period, Clock 2 reads less + # than Clock 1 (which has been turned forward): + cond = clock_1 > clock_2 + nonexistent_begin = clock_2.apply_boolean_mask(cond) + + # The end of the non-existent time period is what Clock 1 reads + # at the moment of transition: + nonexistent_end = clock_1.apply_boolean_mask(cond) + nonexistent = label_bins( + self, + left_edges=nonexistent_begin, + left_inclusive=True, + right_edges=nonexistent_end, + right_inclusive=False, + ).notnull() + + return ambiguous, nonexistent + + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): + from cudf.core._internals.timezones import ( + check_ambiguous_and_nonexistent, + get_tz_data, + ) + + if tz is None: + return self.copy() + check_ambiguous_and_nonexistent(ambiguous, nonexistent) + dtype = pd.DatetimeTZDtype(self.time_unit, tz) + ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( + tz + ) + localized = self._scatter_by_column( + self.isnull() | (ambiguous_col | nonexistent_col), + cudf.Scalar(cudf.NaT, dtype=self.dtype), + ) + + tz_data_for_zone = get_tz_data(tz) + transition_times, offsets = tz_data_for_zone._columns + transition_times_local = (transition_times + offsets).astype( + localized.dtype + ) + indices = ( + search_sorted([transition_times_local], [localized], "right") - 1 + ) + offsets_to_utc = offsets.take(indices, nullify=True) + gmt_data = localized - offsets_to_utc + return DatetimeTZColumn( + data=gmt_data.base_data, + dtype=dtype, + mask=localized.base_mask, + size=gmt_data.size, + offset=gmt_data.offset, + ) + + def tz_convert(self, tz: str | None): + raise TypeError( + "Cannot convert tz-naive timestamps, use tz_localize to localize" + ) + class DatetimeTZColumn(DatetimeColumn): def __init__( @@ -754,9 +885,14 @@ def _utc_time(self): @property def _local_time(self): """Return the local time as naive timestamps.""" - from cudf.core._internals.timezones import utc_to_local + from cudf.core._internals.timezones import get_tz_data - return utc_to_local(self, str(self.dtype.tz)) + tz_data_for_zone = get_tz_data(str(self.dtype.tz)) + transition_times, offsets = tz_data_for_zone._columns + transition_times = transition_times.astype(_get_base_dtype(self.dtype)) + indices = search_sorted([transition_times], [self], "right") - 1 + offsets_from_utc = offsets.take(indices, nullify=True) + return self + offsets_from_utc def as_string_column( self, dtype: Dtype, format: str | None = None @@ -779,3 +915,30 @@ def __repr__(self): f"{arr.to_string()}\n" f"dtype: {self.dtype}" ) + + def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): + from cudf.core._internals.timezones import ( + check_ambiguous_and_nonexistent, + ) + + if tz is None: + return self._local_time + check_ambiguous_and_nonexistent(ambiguous, nonexistent) + raise ValueError( + "Already localized. " + "Use `tz_convert` to convert between time zones." + ) + + def tz_convert(self, tz: str | None): + if tz is None: + return self._utc_time + elif tz == str(self.dtype.tz): + return self.copy() + utc_time = self._utc_time + return type(self)( + data=utc_time.base_data, + dtype=pd.DatetimeTZDtype(self.time_unit, tz), + mask=utc_time.base_mask, + size=utc_time.size, + offset=utc_time.offset, + ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0a7435bd241..f472ea8ff38 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2245,7 +2245,12 @@ def round(self, freq): return self.__class__._from_data({self.name: out_column}) - def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): """ Localize timezone-naive data to timezone-aware data. @@ -2287,17 +2292,12 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ambiguous or nonexistent timestamps are converted to 'NaT'. """ # noqa: E501 - from cudf.core._internals.timezones import delocalize, localize - - if tz is None: - result_col = delocalize(self._column) - else: - result_col = localize(self._column, tz, ambiguous, nonexistent) + result_col = self._column.tz_localize(tz, ambiguous, nonexistent) return DatetimeIndex._from_data( {self.name: result_col}, freq=self._freq ) - def tz_convert(self, tz): + def tz_convert(self, tz: str | None): """ Convert tz-aware datetimes from one time zone to another. @@ -2329,12 +2329,7 @@ def tz_convert(self, tz): '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') """ # noqa: E501 - from cudf.core._internals.timezones import convert - - if tz is None: - result_col = self._column._utc_time - else: - result_col = convert(self._column, tz) + result_col = self._column.tz_convert(tz) return DatetimeIndex._from_data({self.name: result_col}) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 275dc664175..aba1688d74a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4753,22 +4753,22 @@ def strftime(self, date_format, *args, **kwargs): ) @copy_docstring(DatetimeIndex.tz_localize) - def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): - from cudf.core._internals.timezones import delocalize, localize - - if tz is None: - result_col = delocalize(self.series._column) - else: - result_col = localize( - self.series._column, tz, ambiguous, nonexistent - ) + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): + result_col = self.series._column.tz_localize( + tz, ambiguous, nonexistent + ) return Series._from_data( data={self.series.name: result_col}, index=self.series._index, ) @copy_docstring(DatetimeIndex.tz_convert) - def tz_convert(self, tz): + def tz_convert(self, tz: str | None): """ Parameters ---------- @@ -4778,12 +4778,7 @@ def tz_convert(self, tz): A `tz` of None will convert to UTC and remove the timezone information. """ - from cudf.core._internals.timezones import convert - - if tz is None: - result_col = self.series._column._utc_time - else: - result_col = convert(self.series._column, tz) + result_col = self.series._column.tz_convert(tz) return Series._from_data( {self.series.name: result_col}, index=self.series._index ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index ed8fca88acd..63ec2273148 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -332,9 +332,6 @@ def _process_col( format: Optional[str], utc: bool, ): - # Causes circular import - from cudf.core._internals.timezones import localize - if col.dtype.kind == "f": if unit not in (None, "ns"): factor = cudf.Scalar( @@ -411,7 +408,7 @@ def _process_col( f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" ) if utc and not isinstance(col.dtype, pd.DatetimeTZDtype): - return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT") + return col.tz_localize("UTC") return col diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 6ee339ee3ea..7ef55761b2b 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -218,3 +218,8 @@ def test_contains_tz_aware(item, expected): dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC") result = item in dti assert result == expected + + +def test_tz_convert_naive_typeerror(): + with pytest.raises(TypeError): + cudf.date_range("2020", periods=2, freq="D").tz_convert(None) From 049b63cec49671f7e71811652010e7e982b217fc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Apr 2024 15:12:32 -0700 Subject: [PATCH 2/6] Have get_tz_data return tuple of columns instead of DataFrame --- python/cudf/cudf/core/_internals/timezones.py | 34 ++++++++++--------- python/cudf/cudf/core/column/datetime.py | 19 ++++------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 13f1a6c8ca2..d81464626d1 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -3,16 +3,18 @@ import os import zoneinfo from functools import lru_cache +from typing import Literal, Tuple import numpy as np from cudf._lib.timezone import make_timezone_transition_table from cudf.core.column.column import as_column -from cudf.core.dataframe import DataFrame +from cudf.core.column.datetime import DatetimeColumn +from cudf.core.column.timedelta import TimeDeltaColumn @lru_cache(maxsize=20) -def get_tz_data(zone_name) -> DataFrame: +def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. @@ -36,14 +38,18 @@ def get_tz_data(zone_name) -> DataFrame: return tz_table -def _find_and_read_tzfile_tzpath(zone_name) -> DataFrame: +def _find_and_read_tzfile_tzpath( + zone_name: str, +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_frame(search_path, zone_name) raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _find_and_read_tzfile_tzdata(zone_name) -> DataFrame: +def _find_and_read_tzfile_tzdata( + zone_name: str, +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: import importlib.resources package_base = "tzdata.zoneinfo" @@ -70,7 +76,9 @@ def _find_and_read_tzfile_tzdata(zone_name) -> DataFrame: raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _read_tzfile_as_frame(tzdir, zone_name) -> DataFrame: +def _read_tzfile_as_frame( + tzdir, zone_name: str +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -78,19 +86,13 @@ def _read_tzfile_as_frame(tzdir, zone_name) -> DataFrame: if not transition_times_and_offsets: # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - transition_times_and_offsets = ( - as_column([min_date]), - as_column([np.timedelta64(0, "s")]), - ) - - return DataFrame._from_data( - dict( - zip(["transition_times", "offsets"], transition_times_and_offsets) - ) - ) + return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) + return tuple(transition_times_and_offsets) # type: ignore[return-value] -def check_ambiguous_and_nonexistent(ambiguous, nonexistent) -> None: +def check_ambiguous_and_nonexistent( + ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] +) -> None: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 0fd46e8b7d5..220477f9766 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -718,19 +718,16 @@ def _find_ambiguous_and_nonexistent( """ from cudf.core._internals.timezones import get_tz_data - tz_data_for_zone = get_tz_data(zone_name) - transition_times = tz_data_for_zone["transition_times"] - offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{self.time_unit}]" - ) + transition_times, offsets = get_tz_data(zone_name) + offsets = offsets.astype(f"timedelta64[{self.time_unit}]") # type: ignore[assignment] if len(offsets) == 1: # no transitions return False, False transition_times, offsets, old_offsets = ( - transition_times[1:]._column, - offsets[1:]._column, - offsets[:-1]._column, + transition_times.slice(1, len(transition_times)), + offsets.slice(1, len(offsets)), + offsets.slice(0, len(offsets) - 1), ) # Assume we have two clocks at the moment of transition: @@ -796,8 +793,7 @@ def tz_localize( cudf.Scalar(cudf.NaT, dtype=self.dtype), ) - tz_data_for_zone = get_tz_data(tz) - transition_times, offsets = tz_data_for_zone._columns + transition_times, offsets = get_tz_data(tz) transition_times_local = (transition_times + offsets).astype( localized.dtype ) @@ -887,8 +883,7 @@ def _local_time(self): """Return the local time as naive timestamps.""" from cudf.core._internals.timezones import get_tz_data - tz_data_for_zone = get_tz_data(str(self.dtype.tz)) - transition_times, offsets = tz_data_for_zone._columns + transition_times, offsets = get_tz_data(str(self.dtype.tz)) transition_times = transition_times.astype(_get_base_dtype(self.dtype)) indices = search_sorted([transition_times], [self], "right") - 1 offsets_from_utc = offsets.take(indices, nullify=True) From 91d4f410f7ff7a29ec95350430e367a95056d75c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 2 May 2024 14:33:01 -0700 Subject: [PATCH 3/6] Address comments in timezones.py --- python/cudf/cudf/core/_internals/timezones.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index d81464626d1..18151a9d074 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -26,8 +26,8 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: Returns ------- - DataFrame with two columns containing the transition times - ("transition_times") and corresponding UTC offsets ("offsets"). + Tuple with two columns containing the transition times + and corresponding UTC offsets. """ try: # like zoneinfo, we first look in TZPATH @@ -43,7 +43,7 @@ def _find_and_read_tzfile_tzpath( ) -> Tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): - return _read_tzfile_as_frame(search_path, zone_name) + return _read_tzfile_as_columns(search_path, zone_name) raise zoneinfo.ZoneInfoNotFoundError(zone_name) @@ -54,7 +54,7 @@ def _find_and_read_tzfile_tzdata( package_base = "tzdata.zoneinfo" try: - return _read_tzfile_as_frame( + return _read_tzfile_as_columns( str(importlib.resources.files(package_base)), zone_name ) # TODO: make it so that the call to libcudf raises a @@ -76,7 +76,7 @@ def _find_and_read_tzfile_tzdata( raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _read_tzfile_as_frame( +def _read_tzfile_as_columns( tzdir, zone_name: str ) -> Tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( From 0ddc51981d0f8f4ef1e191a43e28087aa0aa253a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 2 May 2024 14:38:22 -0700 Subject: [PATCH 4/6] address comments --- python/cudf/cudf/core/_internals/timezones.py | 3 ++- python/cudf/cudf/core/column/datetime.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 18151a9d074..f04cae719c2 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -92,7 +92,7 @@ def _read_tzfile_as_columns( def check_ambiguous_and_nonexistent( ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] -) -> None: +) -> Tuple[Literal["NaT"], Literal["NaT"]]: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" @@ -101,3 +101,4 @@ def check_ambiguous_and_nonexistent( raise NotImplementedError( "Only nonexistent='NaT' is currently supported" ) + return ambiguous, nonexistent diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index e609e33b45e..8f5f6c3feb5 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -672,13 +672,13 @@ def _with_type_metadata(self, dtype): def _find_ambiguous_and_nonexistent( self, zone_name: str - ) -> Tuple[NumericalColumn | bool, NumericalColumn | bool]: + ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]: """ Recognize ambiguous and nonexistent timestamps for the given timezone. Returns a tuple of columns, both of "bool" dtype and of the same - size as `data`, that respectively indicate ambiguous and - nonexistent timestamps in `data` with the value `True`. + size as `self`, that respectively indicate ambiguous and + nonexistent timestamps in `self` with the value `True`. Ambiguous and/or nonexistent timestamps are only possible if any transitions occur in the time zone database for the given timezone. From 426ecf71c262630cbf6b0b9e0fd1e3f2eb8a2981 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 3 May 2024 11:20:22 +0100 Subject: [PATCH 5/6] Use return value --- python/cudf/cudf/core/column/datetime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 8f5f6c3feb5..3f541c71576 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -751,7 +751,7 @@ def tz_localize( if tz is None: return self.copy() - check_ambiguous_and_nonexistent(ambiguous, nonexistent) + ambiguous, nonexistent = check_ambiguous_and_nonexistent(ambiguous, nonexistent) dtype = pd.DatetimeTZDtype(self.time_unit, tz) ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( tz @@ -886,7 +886,7 @@ def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): if tz is None: return self._local_time - check_ambiguous_and_nonexistent(ambiguous, nonexistent) + ambiguous, nonexistent = check_ambiguous_and_nonexistent(ambiguous, nonexistent) raise ValueError( "Already localized. " "Use `tz_convert` to convert between time zones." From d2c0cd275419457751e4073b15f9e4701241a36c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 3 May 2024 11:31:39 +0100 Subject: [PATCH 6/6] Style --- python/cudf/cudf/core/column/datetime.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 3f541c71576..9fe4e5da96d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -751,7 +751,9 @@ def tz_localize( if tz is None: return self.copy() - ambiguous, nonexistent = check_ambiguous_and_nonexistent(ambiguous, nonexistent) + ambiguous, nonexistent = check_ambiguous_and_nonexistent( + ambiguous, nonexistent + ) dtype = pd.DatetimeTZDtype(self.time_unit, tz) ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( tz @@ -886,7 +888,9 @@ def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): if tz is None: return self._local_time - ambiguous, nonexistent = check_ambiguous_and_nonexistent(ambiguous, nonexistent) + ambiguous, nonexistent = check_ambiguous_and_nonexistent( + ambiguous, nonexistent + ) raise ValueError( "Already localized. " "Use `tz_convert` to convert between time zones."