Add support for DatetimeTZDtype and tz_localize (#13163)

TBD

Quick benchmark:

```python
psr = pd.Series(pd.date_range("1970-01-01", "1980-01-01", freq="1T"))
sr = cudf.from_pandas(psr)

In [6]: %timeit psr.dt.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT")
1.55 s ± 9.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [7]: %timeit sr.dt.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT")
9.98 ms ± 246 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Authors:
- Ashwin Srinath (https://github.com/shwina)
- Vukasin Milovanovic (https://github.com/vuule)

Approvers:
- Matthew Roeschke (https://github.com/mroeschke)
- Bradley Dice (https://github.com/bdice)
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #13163
rapidsai · Apr 27, 2023 · e13dfec · e13dfec
1 parent f4e0f19
commit e13dfec
Show file tree

Hide file tree

Showing 15 changed files with 478 additions and 47 deletions.
diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst
@@ -283,6 +283,7 @@ Time-specific operations
    DatetimeIndex.round
    DatetimeIndex.ceil
    DatetimeIndex.floor
+   DatetimeIndex.tz_localize
 
 Conversion
 ~~~~~~~~~~

diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
@@ -295,6 +295,7 @@ Datetime methods
    round
    floor
    ceil
+   tz_localize
 
 
 Timedelta properties

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
@@ -8,15 +8,15 @@ import rmm
 
 import cudf
 import cudf._lib as libcudf
-from cudf.api.types import is_categorical_dtype
+from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
 from cudf.core.buffer import (
     Buffer,
     CopyOnWriteBuffer,
     SpillableBuffer,
     acquire_spill_lock,
     as_buffer,
 )
-
+from cudf.utils.dtypes import _get_base_dtype
 from cpython.buffer cimport PyObject_CheckBuffer
 from libc.stdint cimport uintptr_t
 from libcpp.memory cimport make_unique, unique_ptr
@@ -313,9 +313,13 @@ cdef class Column:
     cdef mutable_column_view mutable_view(self) except *:
         if is_categorical_dtype(self.dtype):
             col = self.base_children[0]
+            data_dtype = col.dtype
+        elif is_datetime64tz_dtype(self.dtype):
+            col = self
+            data_dtype = _get_base_dtype(col.dtype)
         else:
             col = self
-        data_dtype = col.dtype
+            data_dtype = col.dtype
 
         cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
@@ -373,9 +377,12 @@ cdef class Column:
         if is_categorical_dtype(self.dtype):
             col = self.base_children[0]
             data_dtype = col.dtype
+        elif is_datetime64tz_dtype(self.dtype):
+            col = self
+            data_dtype = _get_base_dtype(col.dtype)
         else:
             col = self
-            data_dtype = self.dtype
+            data_dtype = col.dtype
 
         cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset

diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
@@ -3,9 +3,19 @@
 import os
 import zoneinfo
 from functools import lru_cache
+from typing import Tuple, cast
 
-from cudf._lib.timezone import build_timezone_transition_table
+import numpy as np
+import pandas as pd
+
+import cudf
+from cudf._lib.labeling import label_bins
+from cudf._lib.search import search_sorted
+from cudf._lib.timezone import make_timezone_transition_table
+from cudf.core.column.column import as_column, build_column
+from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
 from cudf.core.dataframe import DataFrame
+from cudf.utils.dtypes import _get_base_dtype
 
 
 @lru_cache(maxsize=20)
@@ -21,15 +31,16 @@ def get_tz_data(zone_name):
 
     Returns
     -------
-    DataFrame with two columns containing the transition times ("dt")
-    and corresponding UTC offsets ("offset").
+    DataFrame with two columns containing the transition times
+    ("transition_times") and corresponding UTC offsets ("offsets").
     """
     try:
         # like zoneinfo, we first look in TZPATH
-        return _find_and_read_tzfile_tzpath(zone_name)
+        tz_table = _find_and_read_tzfile_tzpath(zone_name)
     except zoneinfo.ZoneInfoNotFoundError:
         # if that fails, we fall back to using `tzdata`
-        return _find_and_read_tzfile_tzdata(zone_name)
+        tz_table = _find_and_read_tzfile_tzdata(zone_name)
+    return tz_table
 
 
 def _find_and_read_tzfile_tzpath(zone_name):
@@ -67,5 +78,143 @@ def _find_and_read_tzfile_tzdata(zone_name):
 
 
 def _read_tzfile_as_frame(tzdir, zone_name):
-    dt, offsets = build_timezone_transition_table(tzdir, zone_name)
-    return DataFrame._from_columns([dt, offsets], ["dt", "offsets"])
+    """
+    Build the transition table for `zone_name` as a DataFrame.
+
+    Passes `tzdir` and `zone_name` to `make_timezone_transition_table`
+    and wraps the result in a DataFrame with the columns
+    "transition_times" and "offsets". When that call returns an empty
+    result, a single-row table (earliest representable time, zero
+    offset) is used instead so the frame always has at least one row.
+    """
+    transition_times_and_offsets = make_timezone_transition_table(
+        tzdir, zone_name
+    )
+
+    if not transition_times_and_offsets:
+        # this happens for UTC-like zones
+        # NOTE(review): the `+ 1` presumably skips the int64 minimum,
+        # which NumPy uses to represent NaT -- confirm.
+        min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
+        transition_times_and_offsets = as_column([min_date]), as_column(
+            [np.timedelta64(0, "s")]
+        )
+
+    return DataFrame._from_columns(
+        transition_times_and_offsets, ["transition_times", "offsets"]
+    )
+
+
+def _find_ambiguous_and_nonexistent(
+    data: DatetimeColumn, zone_name: str
+) -> Tuple:
+    """
+    Recognize ambiguous and nonexistent timestamps for the given timezone.
+
+    Returns a tuple of columns, both of "bool" dtype and of the same
+    size as `data`, that respectively indicate ambiguous and
+    nonexistent timestamps in `data` with the value `True`.
+
+    Ambiguous and/or nonexistent timestamps are only possible if any
+    transitions occur in the time zone database for the given timezone.
+    If no transitions occur, the tuple `(False, False)` is returned.
+
+    Parameters
+    ----------
+    data : DatetimeColumn
+        Timestamps to classify; they are compared against local
+        (wall-clock) transition boundaries.
+    zone_name : str
+        Name of the time zone, passed to `get_tz_data`.
+
+    Returns
+    -------
+    tuple
+        `(ambiguous, nonexistent)`: two columns, or the Python scalars
+        `(False, False)` when the zone's table has a single row.
+    """
+    tz_data_for_zone = get_tz_data(zone_name)
+    transition_times = tz_data_for_zone["transition_times"]
+    # Offsets are cast to the time unit of `data` before any arithmetic.
+    offsets = tz_data_for_zone["offsets"].astype(
+        f"timedelta64[{data._time_unit}]"
+    )
+
+    if len(offsets) == 1:  # no transitions
+        return False, False
+
+    # Drop the first row and pair every remaining transition with the
+    # offset taking effect at it (`offsets[1:]`) and the offset of the
+    # preceding row (`offsets[:-1]`).
+    transition_times, offsets, old_offsets = (
+        transition_times[1:]._column,
+        offsets[1:]._column,
+        offsets[:-1]._column,
+    )
+
+    # Assume we have two clocks at the moment of transition:
+    # - Clock 1 is turned forward or backwards correctly
+    # - Clock 2 makes no changes
+    clock_1 = transition_times + offsets
+    clock_2 = transition_times + old_offsets
+
+    # At the start of an ambiguous time period, Clock 1 (which has
+    # been turned back) reads less than Clock 2:
+    cond = clock_1 < clock_2
+    ambiguous_begin = clock_1.apply_boolean_mask(cond)
+
+    # The end of an ambiguous time period is what Clock 2 reads at
+    # the moment of transition:
+    ambiguous_end = clock_2.apply_boolean_mask(cond)
+    # NOTE(review): `label_bins` presumably yields null for values that
+    # fall in no [begin, end) interval, so `notnull()` marks the hits
+    # -- confirm against `cudf._lib.labeling`.
+    ambiguous = label_bins(
+        data,
+        left_edges=ambiguous_begin,
+        left_inclusive=True,
+        right_edges=ambiguous_end,
+        right_inclusive=False,
+    ).notnull()
+
+    # At the start of a non-existent time period, Clock 2 reads less
+    # than Clock 1 (which has been turned forward):
+    cond = clock_1 > clock_2
+    nonexistent_begin = clock_2.apply_boolean_mask(cond)
+
+    # The end of the non-existent time period is what Clock 1 reads
+    # at the moment of transition:
+    nonexistent_end = clock_1.apply_boolean_mask(cond)
+    nonexistent = label_bins(
+        data,
+        left_edges=nonexistent_begin,
+        left_inclusive=True,
+        right_edges=nonexistent_end,
+        right_inclusive=False,
+    ).notnull()
+
+    return ambiguous, nonexistent
+
+
+def localize(
+    data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
+) -> DatetimeTZColumn:
+    """
+    Localize naive timestamps to `zone_name`, producing a tz-aware column.
+
+    Entries of `data` that are null, ambiguous, or nonexistent in the
+    zone are replaced by NA; the remaining values are converted with
+    `local_to_utc` and returned under a `pd.DatetimeTZDtype` built from
+    the time unit of `data` and `zone_name`.
+
+    Parameters
+    ----------
+    data : DatetimeColumn
+        Timezone-naive timestamps.
+    zone_name : str
+        Name of the target time zone.
+    ambiguous, nonexistent
+        Only the value "NaT" is accepted for either argument.
+
+    Raises
+    ------
+    NotImplementedError
+        If `ambiguous` or `nonexistent` is anything other than "NaT".
+    ValueError
+        If `data` is already a `DatetimeTZColumn`.
+    """
+    if ambiguous != "NaT":
+        raise NotImplementedError(
+            "Only ambiguous='NaT' is currently supported"
+        )
+    if nonexistent != "NaT":
+        raise NotImplementedError(
+            "Only nonexistent='NaT' is currently supported"
+        )
+    if isinstance(data, DatetimeTZColumn):
+        raise ValueError(
+            "Already localized. "
+            "Use `tz_convert` to convert between time zones."
+        )
+    dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
+    # From here on `ambiguous` / `nonexistent` no longer hold the
+    # option strings: they are rebound to the per-row masks (or to the
+    # scalars False, False for zones without transitions).
+    ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
+    # Overwrite every null, ambiguous or nonexistent row with NA.
+    localized = cast(
+        DatetimeColumn,
+        data._scatter_by_column(
+            data.isnull() | (ambiguous | nonexistent),
+            cudf.Scalar(cudf.NA, dtype=data.dtype),
+        ),
+    )
+    gmt_data = local_to_utc(localized, zone_name)
+    # The result takes its values from the UTC-converted data and its
+    # null mask from `localized`.
+    return cast(
+        DatetimeTZColumn,
+        build_column(
+            data=gmt_data.data,
+            dtype=dtype,
+            mask=localized.mask,
+            size=gmt_data.size,
+            offset=gmt_data.offset,
+        ),
+    )
+
+
+def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
+    """
+    Shift UTC timestamps in `data` by the UTC offset of `zone_name`.
+
+    For every timestamp, the row of the zone's transition table just
+    before its right-side insertion point is selected and that row's
+    offset is added to the timestamp.
+    """
+    transitions, utc_offsets = get_tz_data(zone_name)._columns
+    # Compare in the base (timezone-naive) dtype of `data`.
+    transitions = transitions.astype(_get_base_dtype(data.dtype))
+    # "right" insertion point minus one picks the preceding table row.
+    row_positions = search_sorted([transitions], [data], "right") - 1
+    return data + utc_offsets.take(row_positions, nullify=True)
+
+
+def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
+    """
+    Shift local wall-clock timestamps in `data` back to UTC.
+
+    The zone's transition times are first moved into local time by
+    adding their offsets; each timestamp is then matched to the row
+    just before its right-side insertion point and that row's offset
+    is subtracted from it.
+    """
+    transitions, utc_offsets = get_tz_data(zone_name)._columns
+    # Express the transition instants in local time so they can be
+    # compared directly with the local timestamps in `data`.
+    local_transitions = (transitions + utc_offsets).astype(data.dtype)
+    row_positions = search_sorted([local_transitions], [data], "right") - 1
+    return data - utc_offsets.take(row_positions, nullify=True)
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 """
 isort: skip_file
@@ -23,6 +23,7 @@
     serialize_columns,
 )
 from cudf.core.column.datetime import DatetimeColumn  # noqa: F401
+from cudf.core.column.datetime import DatetimeTZColumn  # noqa: F401
 from cudf.core.column.lists import ListColumn  # noqa: F401
 from cudf.core.column.numerical import NumericalColumn  # noqa: F401
 from cudf.core.column.string import StringColumn  # noqa: F401

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1554,6 +1554,17 @@ def build_column(
             offset=offset,
             null_count=null_count,
         )
+    elif is_datetime64tz_dtype(dtype):
+        if data is None:
+            raise TypeError("Must specify data buffer")
+        return cudf.core.column.datetime.DatetimeTZColumn(
+            data=data,
+            dtype=dtype,
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+        )
     elif dtype.type is np.timedelta64:
         if data is None:
             raise TypeError("Must specify data buffer")
@@ -2093,9 +2104,7 @@ def as_column(
                 data = _make_copy_replacing_NaT_with_null(data)
                 mask = data.mask
 
-            data = cudf.core.column.datetime.DatetimeColumn(
-                data=buffer, mask=mask, dtype=arbitrary.dtype
-            )
+            data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
         elif arb_dtype.kind == "m":
 
             time_unit = get_time_unit(arbitrary)
@@ -2243,8 +2252,8 @@ def as_column(
                         raise TypeError
                     if is_datetime64tz_dtype(dtype):
                         raise NotImplementedError(
-                            "cuDF does not yet support "
-                            "timezone-aware datetimes"
+                            "Use `tz_localize()` to construct "
+                            "timezone aware data."
                         )
                     if is_list_dtype(dtype):
                         data = pa.array(arbitrary)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 
 import cudf
 from cudf import _lib as libcudf
@@ -20,10 +21,16 @@
     DtypeObj,
     ScalarLike,
 )
-from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
+from cudf.api.types import (
+    is_datetime64_dtype,
+    is_datetime64tz_dtype,
+    is_scalar,
+    is_timedelta64_dtype,
+)
 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
+from cudf.utils.dtypes import _get_base_dtype
 from cudf.utils.utils import _fillna_natwise
 
 _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
@@ -517,6 +524,63 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
         else:
             return False
 
+    def _with_type_metadata(self, dtype):
+        """
+        Return this column re-typed according to `dtype`.
+
+        For a timezone-aware datetime dtype, a `DatetimeTZColumn` is
+        built from this column's base data, base mask, size, offset
+        and null count. For any other dtype the column itself is
+        returned unchanged.
+        """
+        if not is_datetime64tz_dtype(dtype):
+            return self
+        tz_column_kwargs = dict(
+            data=self.base_data,
+            dtype=dtype,
+            mask=self.base_mask,
+            size=self.size,
+            offset=self.offset,
+            null_count=self.null_count,
+        )
+        return DatetimeTZColumn(**tz_column_kwargs)
+
+
+class DatetimeTZColumn(DatetimeColumn):
+    """
+    Datetime column carrying a `pd.DatetimeTZDtype`.
+
+    The constructor hands the base (timezone-naive) dtype to
+    `DatetimeColumn` and then stores the timezone-aware dtype on the
+    column. The stored values are treated as UTC: conversions to
+    pandas, Arrow and strings all go through `_local_time`, which
+    applies `utc_to_local` for the dtype's time zone.
+    """
+
+    def __init__(
+        self,
+        data: Buffer,
+        dtype: pd.DatetimeTZDtype,
+        mask: Buffer = None,
+        size: int = None,
+        offset: int = 0,
+        null_count: int = None,
+    ):
+        # Initialize as a plain datetime column using the base dtype,
+        # then overwrite the dtype with the timezone-aware one.
+        super().__init__(
+            data=data,
+            dtype=_get_base_dtype(dtype),
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+        )
+        self._dtype = dtype
+
+    def to_pandas(
+        self, index: pd.Index = None, nullable: bool = False, **kwargs
+    ) -> pd.Series:
+        """
+        Convert to a timezone-aware pandas Series.
+
+        Parameters
+        ----------
+        index : pd.Index, optional
+            Index to attach to the resulting Series. When omitted the
+            Series keeps the index produced by the conversion.
+        nullable : bool, default False
+            Accepted for signature compatibility; not used here.
+        **kwargs
+            Accepted for signature compatibility; not used here.
+
+        Returns
+        -------
+        pd.Series
+            The local wall-clock times, localized to this column's
+            time zone with ambiguous and nonexistent times set to NaT.
+        """
+        pd_series = self._local_time.to_pandas().dt.tz_localize(
+            self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
+        )
+        # Honor the caller-supplied index instead of silently dropping
+        # it, so the result lines up with the object being converted.
+        if index is not None:
+            pd_series.index = index
+        return pd_series
+
+    def to_arrow(self):
+        """
+        Convert to an Arrow array of timestamps in this column's zone.
+
+        The naive local times are converted to Arrow and then tagged
+        with the zone name via `pyarrow.compute.assume_timezone`.
+        """
+        return pa.compute.assume_timezone(
+            self._local_time.to_arrow(), str(self.dtype.tz)
+        )
+
+    @property
+    def _local_time(self):
+        """Return the local time as naive timestamps."""
+        # Imported here rather than at module level.
+        # NOTE(review): presumably to avoid a circular import, since
+        # the timezones module imports from this one -- confirm.
+        from cudf.core._internals.timezones import utc_to_local
+
+        return utc_to_local(self, str(self.dtype.tz))
+
+    def as_string_column(
+        self, dtype: Dtype, format=None, **kwargs
+    ) -> "cudf.core.column.StringColumn":
+        """Format the local wall-clock times as a string column."""
+        return self._local_time.as_string_column(dtype, format, **kwargs)
+
 
 def infer_format(element: str, **kwargs) -> str:
     """
-Original file line number
+Diff line change
@@ Expand Up / @@ -295,6 +295,7 @@ Datetime methods @@
        round
        floor
        ceil
+       tz_localize
     Timedelta properties
@@ Expand Down @@