diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 4888cdd9ac9..f04cae719c2 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -3,23 +3,18 @@ import os import zoneinfo from functools import lru_cache -from typing import Tuple, cast +from typing import Literal, Tuple import numpy as np -import pandas as pd -import cudf -from cudf._lib.labeling import label_bins -from cudf._lib.search import search_sorted from cudf._lib.timezone import make_timezone_transition_table -from cudf.core.column.column import as_column, build_column -from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn -from cudf.core.dataframe import DataFrame -from cudf.utils.dtypes import _get_base_dtype +from cudf.core.column.column import as_column +from cudf.core.column.datetime import DatetimeColumn +from cudf.core.column.timedelta import TimeDeltaColumn @lru_cache(maxsize=20) -def get_tz_data(zone_name): +def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. @@ -31,8 +26,8 @@ def get_tz_data(zone_name): Returns ------- - DataFrame with two columns containing the transition times - ("transition_times") and corresponding UTC offsets ("offsets"). + Tuple with two columns containing the transition times + and corresponding UTC offsets. """ try: # like zoneinfo, we first look in TZPATH @@ -43,19 +38,23 @@ def get_tz_data(zone_name): return tz_table -def _find_and_read_tzfile_tzpath(zone_name): +def _find_and_read_tzfile_tzpath( + zone_name: str, +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): - return _read_tzfile_as_frame(search_path, zone_name) + return _read_tzfile_as_columns(search_path, zone_name) raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _find_and_read_tzfile_tzdata(zone_name): +def _find_and_read_tzfile_tzdata( + zone_name: str, +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: import importlib.resources package_base = "tzdata.zoneinfo" try: - return _read_tzfile_as_frame( + return _read_tzfile_as_columns( str(importlib.resources.files(package_base)), zone_name ) # TODO: make it so that the call to libcudf raises a @@ -77,7 +76,9 @@ def _find_and_read_tzfile_tzdata(zone_name): raise zoneinfo.ZoneInfoNotFoundError(zone_name) -def _read_tzfile_as_frame(tzdir, zone_name): +def _read_tzfile_as_columns( + tzdir, zone_name: str +) -> Tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -85,91 +86,13 @@ def _read_tzfile_as_frame(tzdir, zone_name): if not transition_times_and_offsets: # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - transition_times_and_offsets = ( - as_column([min_date]), - as_column([np.timedelta64(0, "s")]), - ) - - return DataFrame._from_data( - dict( - zip(["transition_times", "offsets"], transition_times_and_offsets) - ) - ) - + return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) + return tuple(transition_times_and_offsets) # type: ignore[return-value] -def _find_ambiguous_and_nonexistent( - data: DatetimeColumn, zone_name: str -) -> Tuple: - """ - Recognize ambiguous and nonexistent timestamps for the given timezone. - - Returns a tuple of columns, both of "bool" dtype and of the same - size as `data`, that respectively indicate ambiguous and - nonexistent timestamps in `data` with the value `True`. - - Ambiguous and/or nonexistent timestamps are only possible if any - transitions occur in the time zone database for the given timezone. - If no transitions occur, the tuple `(False, False)` is returned. - """ - tz_data_for_zone = get_tz_data(zone_name) - transition_times = tz_data_for_zone["transition_times"] - offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{data.time_unit}]" - ) - if len(offsets) == 1: # no transitions - return False, False - - transition_times, offsets, old_offsets = ( - transition_times[1:]._column, - offsets[1:]._column, - offsets[:-1]._column, - ) - - # Assume we have two clocks at the moment of transition: - # - Clock 1 is turned forward or backwards correctly - # - Clock 2 makes no changes - clock_1 = transition_times + offsets - clock_2 = transition_times + old_offsets - - # At the start of an ambiguous time period, Clock 1 (which has - # been turned back) reads less than Clock 2: - cond = clock_1 < clock_2 - ambiguous_begin = clock_1.apply_boolean_mask(cond) - - # The end of an ambiguous time period is what Clock 2 reads at - # the moment of transition: - ambiguous_end = clock_2.apply_boolean_mask(cond) - ambiguous = label_bins( - data, - left_edges=ambiguous_begin, - left_inclusive=True, - right_edges=ambiguous_end, - right_inclusive=False, - ).notnull() - - # At the start of a non-existent time period, Clock 2 reads less - # than Clock 1 (which has been turned forward): - cond = clock_1 > clock_2 - nonexistent_begin = clock_2.apply_boolean_mask(cond) - - # The end of the non-existent time period is what Clock 1 reads - # at the moment of transition: - nonexistent_end = clock_1.apply_boolean_mask(cond) - nonexistent = label_bins( - data, - left_edges=nonexistent_begin, - left_inclusive=True, - right_edges=nonexistent_end, - right_inclusive=False, - ).notnull() - - return ambiguous, nonexistent - - -def localize( - data: DatetimeColumn, zone_name: str, ambiguous, nonexistent -) -> DatetimeTZColumn: +def check_ambiguous_and_nonexistent( + ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] +) -> Tuple[Literal["NaT"], Literal["NaT"]]: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" @@ -178,80 +101,4 @@ def localize( raise NotImplementedError( "Only nonexistent='NaT' is currently supported" ) - if isinstance(data, DatetimeTZColumn): - raise ValueError( - "Already localized. " - "Use `tz_convert` to convert between time zones." - ) - dtype = pd.DatetimeTZDtype(data.time_unit, zone_name) - ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) - localized = cast( - DatetimeColumn, - data._scatter_by_column( - data.isnull() | (ambiguous | nonexistent), - cudf.Scalar(cudf.NaT, dtype=data.dtype), - ), - ) - gmt_data = local_to_utc(localized, zone_name) - return cast( - DatetimeTZColumn, - build_column( - data=gmt_data.base_data, - dtype=dtype, - mask=localized.base_mask, - size=gmt_data.size, - offset=gmt_data.offset, - ), - ) - - -def delocalize(data: DatetimeColumn) -> DatetimeColumn: - """ - Convert a timezone-aware datetime column to a timezone-naive one. - If the column is already timezone-naive, return it as is. - """ - if isinstance(data, DatetimeTZColumn): - return data._local_time - # already timezone-naive: - return data - - -def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn: - if not isinstance(data, DatetimeTZColumn): - raise TypeError( - "Cannot convert from timezone-naive timestamps to " - "timezone-aware timestamps. For that, " - "use `tz_localize`." - ) - if zone_name == str(data.dtype.tz): - return data.copy() - utc_time = data._utc_time - out = cast( - DatetimeTZColumn, - build_column( - data=utc_time.base_data, - dtype=pd.DatetimeTZDtype(data.time_unit, zone_name), - mask=utc_time.base_mask, - size=utc_time.size, - offset=utc_time.offset, - ), - ) - return out - - -def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn: - tz_data_for_zone = get_tz_data(zone_name) - transition_times, offsets = tz_data_for_zone._columns - transition_times = transition_times.astype(_get_base_dtype(data.dtype)) - indices = search_sorted([transition_times], [data], "right") - 1 - offsets_from_utc = offsets.take(indices, nullify=True) - return data + offsets_from_utc - - -def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn: - tz_data_for_zone = get_tz_data(zone_name) - transition_times, offsets = tz_data_for_zone._columns - transition_times_local = (transition_times + offsets).astype(data.dtype) - indices = search_sorted([transition_times_local], [data], "right") - 1 - offsets_to_utc = offsets.take(indices, nullify=True) - return data - offsets_to_utc + return ambiguous, nonexistent diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 981ef738458..9fe4e5da96d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -7,7 +7,7 @@ import locale import re from locale import nl_langinfo -from typing import Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast import numpy as np import pandas as pd @@ -16,6 +16,8 @@ import cudf from cudf import _lib as libcudf +from cudf._lib.labeling import label_bins +from cudf._lib.search import search_sorted from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, @@ -31,6 +33,9 @@ from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import _all_bools_with_nulls +if TYPE_CHECKING: + from cudf.core.column.numerical import NumericalColumn + if PANDAS_GE_220: _guess_datetime_format = pd.tseries.api.guess_datetime_format else: @@ -665,6 +670,121 @@ def _with_type_metadata(self, dtype): ) return self + def _find_ambiguous_and_nonexistent( + self, zone_name: str + ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]: + """ + Recognize ambiguous and nonexistent timestamps for the given timezone. + + Returns a tuple of columns, both of "bool" dtype and of the same + size as `self`, that respectively indicate ambiguous and + nonexistent timestamps in `self` with the value `True`. + + Ambiguous and/or nonexistent timestamps are only possible if any + transitions occur in the time zone database for the given timezone. + If no transitions occur, the tuple `(False, False)` is returned. + """ + from cudf.core._internals.timezones import get_tz_data + + transition_times, offsets = get_tz_data(zone_name) + offsets = offsets.astype(f"timedelta64[{self.time_unit}]") # type: ignore[assignment] + + if len(offsets) == 1: # no transitions + return False, False + + transition_times, offsets, old_offsets = ( + transition_times.slice(1, len(transition_times)), + offsets.slice(1, len(offsets)), + offsets.slice(0, len(offsets) - 1), + ) + + # Assume we have two clocks at the moment of transition: + # - Clock 1 is turned forward or backwards correctly + # - Clock 2 makes no changes + clock_1 = transition_times + offsets + clock_2 = transition_times + old_offsets + + # At the start of an ambiguous time period, Clock 1 (which has + # been turned back) reads less than Clock 2: + cond = clock_1 < clock_2 + ambiguous_begin = clock_1.apply_boolean_mask(cond) + + # The end of an ambiguous time period is what Clock 2 reads at + # the moment of transition: + ambiguous_end = clock_2.apply_boolean_mask(cond) + ambiguous = label_bins( + self, + left_edges=ambiguous_begin, + left_inclusive=True, + right_edges=ambiguous_end, + right_inclusive=False, + ).notnull() + + # At the start of a non-existent time period, Clock 2 reads less + # than Clock 1 (which has been turned forward): + cond = clock_1 > clock_2 + nonexistent_begin = clock_2.apply_boolean_mask(cond) + + # The end of the non-existent time period is what Clock 1 reads + # at the moment of transition: + nonexistent_end = clock_1.apply_boolean_mask(cond) + nonexistent = label_bins( + self, + left_edges=nonexistent_begin, + left_inclusive=True, + right_edges=nonexistent_end, + right_inclusive=False, + ).notnull() + + return ambiguous, nonexistent + + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): + from cudf.core._internals.timezones import ( + check_ambiguous_and_nonexistent, + get_tz_data, + ) + + if tz is None: + return self.copy() + ambiguous, nonexistent = check_ambiguous_and_nonexistent( + ambiguous, nonexistent + ) + dtype = pd.DatetimeTZDtype(self.time_unit, tz) + ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( + tz + ) + localized = self._scatter_by_column( + self.isnull() | (ambiguous_col | nonexistent_col), + cudf.Scalar(cudf.NaT, dtype=self.dtype), + ) + + transition_times, offsets = get_tz_data(tz) + transition_times_local = (transition_times + offsets).astype( + localized.dtype + ) + indices = ( + search_sorted([transition_times_local], [localized], "right") - 1 + ) + offsets_to_utc = offsets.take(indices, nullify=True) + gmt_data = localized - offsets_to_utc + return DatetimeTZColumn( + data=gmt_data.base_data, + dtype=dtype, + mask=localized.base_mask, + size=gmt_data.size, + offset=gmt_data.offset, + ) + + def tz_convert(self, tz: str | None): + raise TypeError( + "Cannot convert tz-naive timestamps, use tz_localize to localize" + ) + class DatetimeTZColumn(DatetimeColumn): def __init__( @@ -731,9 +851,13 @@ def _utc_time(self): @property def _local_time(self): """Return the local time as naive timestamps.""" - from cudf.core._internals.timezones import utc_to_local + from cudf.core._internals.timezones import get_tz_data - return utc_to_local(self, str(self.dtype.tz)) + transition_times, offsets = get_tz_data(str(self.dtype.tz)) + transition_times = transition_times.astype(_get_base_dtype(self.dtype)) + indices = search_sorted([transition_times], [self], "right") - 1 + offsets_from_utc = offsets.take(indices, nullify=True) + return self + offsets_from_utc def as_string_column( self, dtype: Dtype, format: str | None = None @@ -756,3 +880,32 @@ def __repr__(self): f"{arr.to_string()}\n" f"dtype: {self.dtype}" ) + + def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): + from cudf.core._internals.timezones import ( + check_ambiguous_and_nonexistent, + ) + + if tz is None: + return self._local_time + ambiguous, nonexistent = check_ambiguous_and_nonexistent( + ambiguous, nonexistent + ) + raise ValueError( + "Already localized. " + "Use `tz_convert` to convert between time zones." + ) + + def tz_convert(self, tz: str | None): + if tz is None: + return self._utc_time + elif tz == str(self.dtype.tz): + return self.copy() + utc_time = self._utc_time + return type(self)( + data=utc_time.base_data, + dtype=pd.DatetimeTZDtype(self.time_unit, tz), + mask=utc_time.base_mask, + size=utc_time.size, + offset=utc_time.offset, + ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f55fa4c05b5..583e5d74b56 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2258,7 +2258,12 @@ def round(self, freq): return self.__class__._from_data({self.name: out_column}) - def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): """ Localize timezone-naive data to timezone-aware data. @@ -2300,17 +2305,12 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ambiguous or nonexistent timestamps are converted to 'NaT'. """ # noqa: E501 - from cudf.core._internals.timezones import delocalize, localize - - if tz is None: - result_col = delocalize(self._column) - else: - result_col = localize(self._column, tz, ambiguous, nonexistent) + result_col = self._column.tz_localize(tz, ambiguous, nonexistent) return DatetimeIndex._from_data( {self.name: result_col}, freq=self._freq ) - def tz_convert(self, tz): + def tz_convert(self, tz: str | None): """ Convert tz-aware datetimes from one time zone to another. @@ -2342,12 +2342,7 @@ def tz_convert(self, tz): '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') """ # noqa: E501 - from cudf.core._internals.timezones import convert - - if tz is None: - result_col = self._column._utc_time - else: - result_col = convert(self._column, tz) + result_col = self._column.tz_convert(tz) return DatetimeIndex._from_data({self.name: result_col}) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b6ed28f9093..c3d232aaa7c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4755,22 +4755,22 @@ def strftime(self, date_format, *args, **kwargs): ) @copy_docstring(DatetimeIndex.tz_localize) - def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): - from cudf.core._internals.timezones import delocalize, localize - - if tz is None: - result_col = delocalize(self.series._column) - else: - result_col = localize( - self.series._column, tz, ambiguous, nonexistent - ) + def tz_localize( + self, + tz: str | None, + ambiguous: Literal["NaT"] = "NaT", + nonexistent: Literal["NaT"] = "NaT", + ): + result_col = self.series._column.tz_localize( + tz, ambiguous, nonexistent + ) return Series._from_data( data={self.series.name: result_col}, index=self.series._index, ) @copy_docstring(DatetimeIndex.tz_convert) - def tz_convert(self, tz): + def tz_convert(self, tz: str | None): """ Parameters ---------- @@ -4780,12 +4780,7 @@ def tz_convert(self, tz): A `tz` of None will convert to UTC and remove the timezone information. """ - from cudf.core._internals.timezones import convert - - if tz is None: - result_col = self.series._column._utc_time - else: - result_col = convert(self.series._column, tz) + result_col = self.series._column.tz_convert(tz) return Series._from_data( {self.series.name: result_col}, index=self.series._index ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 907f3b586d1..7f6ce1100ea 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -317,9 +317,6 @@ def _process_col( format: Optional[str], utc: bool, ): - # Causes circular import - from cudf.core._internals.timezones import localize - if col.dtype.kind == "f": if unit not in (None, "ns"): factor = cudf.Scalar( @@ -396,7 +393,7 @@ def _process_col( f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" ) if utc and not isinstance(col.dtype, pd.DatetimeTZDtype): - return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT") + return col.tz_localize("UTC") return col diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 6ee339ee3ea..7ef55761b2b 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -218,3 +218,8 @@ def test_contains_tz_aware(item, expected): dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC") result = item in dti assert result == expected + + +def test_tz_convert_naive_typeerror(): + with pytest.raises(TypeError): + cudf.date_range("2020", periods=2, freq="D").tz_convert(None)