Add support for DatetimeTZDtype and tz_localize #13163

Merged Apr 27, 2023 · 33 commits · changes shown from 24 of the 33 commits

Commits
c2b1968  Raw bindings (shwina, Feb 21, 2023)
fb293c0  move declarations to /include (vuule, Feb 28, 2023)
3457063  Renames (shwina, Mar 7, 2023)
6210054  Remove extra files (shwina, Mar 15, 2023)
cf67f8b  Final touches (shwina, Apr 5, 2023)
82ec81e  Revert "move declarations to /include" (shwina, Apr 17, 2023)
714116d  Add datetimetzdtype (shwina, Apr 17, 2023)
3a09e3d  tmp (shwina, Apr 17, 2023)
8ed31eb  First pass at localize (shwina, Apr 17, 2023)
e72b3fc  Fix up and add tests for localize (shwina, Apr 17, 2023)
d05e9e0  Add tests for nonexistent (shwina, Apr 17, 2023)
5b2c990  Improvements to localize (shwina, Apr 18, 2023)
df3f7bc  Docs (shwina, Apr 18, 2023)
a4f6430  Styles (shwina, Apr 18, 2023)
8f4b215  More style (shwina, Apr 18, 2023)
b6ec602  Use keywords (shwina, Apr 18, 2023)
54dc4d5  Run localize tests for all time zones (shwina, Apr 19, 2023)
fad1c6a  Delocalize tests and index tests (shwina, Apr 19, 2023)
4e89a5c  Use assume_timezone, not cast (shwina, Apr 19, 2023)
a473e51  Apply suggestions from code review (shwina, Apr 19, 2023)
1d18fcd  Doc (shwina, Apr 19, 2023)
99cc4b0  Apply suggestions from code review (shwina, Apr 19, 2023)
6982474  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 19, 2023)
b7563b8  Merge branch 'branch-23.06' into add-tz-localize (shwina, Apr 25, 2023)
a1141d8  Merge branch 'branch-23.06' of github.com:rapidsai/cudf into add-tz-l… (shwina, Apr 26, 2023)
fc5f497  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 26, 2023)
22040ce  Not all platforms have `factory`. (shwina, Apr 26, 2023)
9e0f922  Use Pandas directly to test if we can work with a TZ (shwina, Apr 26, 2023)
cfe5005  Merge branch 'branch-23.06' into add-tz-localize (shwina, Apr 26, 2023)
dcf49f8  Apply suggestions from code review (shwina, Apr 26, 2023)
8cd32f8  Apply review suggestions (shwina, Apr 26, 2023)
94aae6e  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 26, 2023)
ffbcf42  Type metadata handling (shwina, Apr 27, 2023)
2 changes: 2 additions & 0 deletions docs/cudf/source/api_docs/index_objects.rst
@@ -283,6 +283,8 @@ Time-specific operations
DatetimeIndex.round
DatetimeIndex.ceil
DatetimeIndex.floor
DatetimeIndex.tz_localize


Conversion
~~~~~~~~~~
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
@@ -295,6 +295,7 @@ Datetime methods
round
floor
ceil
tz_localize


Timedelta properties
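To ground the doc entries above, here is a minimal usage sketch of the new Series-level API. It is hedged: it assumes the `.dt.tz_localize` accessor spelling implied by these pandas-style docs and the `NaT` handling described later in this PR, and the exact output depends on the cuDF build and the tz data available.

```python
import pandas as pd

import cudf

# Naive timestamps straddling the US spring-forward transition on 2023-03-12;
# the 02:30 wall time does not exist on that date in America/New_York.
naive = pd.to_datetime(
    ["2023-03-12 01:30", "2023-03-12 02:30", "2023-03-12 03:30"]
)
s = cudf.Series(naive)

# The PR implements only ambiguous="NaT" and nonexistent="NaT", so the
# nonexistent 02:30 entry is expected to come back as NaT.
print(s.dt.tz_localize("America/New_York"))
```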
15 changes: 11 additions & 4 deletions python/cudf/cudf/_lib/column.pyx
@@ -8,15 +8,15 @@ import rmm

import cudf
import cudf._lib as libcudf
from cudf.api.types import is_categorical_dtype
from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
from cudf.core.buffer import (
Buffer,
CopyOnWriteBuffer,
SpillableBuffer,
acquire_spill_lock,
as_buffer,
)

from cudf.utils.dtypes import _get_base_dtype
from cpython.buffer cimport PyObject_CheckBuffer
from libc.stdint cimport uintptr_t
from libcpp.memory cimport make_unique, unique_ptr
@@ -313,9 +313,13 @@ cdef class Column:
cdef mutable_column_view mutable_view(self) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = col.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
@@ -373,9 +377,12 @@ cdef class Column:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = self.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
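For context on the `_get_base_dtype` calls added above: the intent is to hand libcudf the tz-naive timestamp type that backs a DatetimeTZDtype column. A rough stand-in illustration follows (hypothetical helper name, not the cuDF implementation):

```python
import numpy as np
import pandas as pd


def base_dtype_of(dtype: pd.DatetimeTZDtype) -> np.dtype:
    """Return the tz-naive datetime64 dtype with the same resolution."""
    return np.dtype(f"datetime64[{dtype.unit}]")


# datetime64[ns]: the type the column's data buffer is interpreted as on the GPU.
print(base_dtype_of(pd.DatetimeTZDtype("ns", "America/New_York")))
```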
163 changes: 156 additions & 7 deletions python/cudf/cudf/core/_internals/timezones.py
@@ -3,9 +3,19 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast

from cudf._lib.timezone import build_timezone_transition_table
import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype


@lru_cache(maxsize=20)
@@ -21,15 +31,16 @@ def get_tz_data(zone_name):

Returns
-------
DataFrame with two columns containing the transition times ("dt")
and corresponding UTC offsets ("offset").
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
"""
try:
# like zoneinfo, we first look in TZPATH
return _find_and_read_tzfile_tzpath(zone_name)
tz_table = _find_and_read_tzfile_tzpath(zone_name)
except zoneinfo.ZoneInfoNotFoundError:
# if that fails, we fall back to using `tzdata`
return _find_and_read_tzfile_tzdata(zone_name)
tz_table = _find_and_read_tzfile_tzdata(zone_name)
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
@@ -67,5 +78,143 @@ def _find_and_read_tzfile_tzdata(zone_name):


def _read_tzfile_as_frame(tzdir, zone_name):
dt, offsets = build_timezone_transition_table(tzdir, zone_name)
return DataFrame._from_columns([dt, offsets], ["dt", "offsets"])
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = as_column([min_date]), as_column(
[np.timedelta64(0, "s")]
)

return DataFrame._from_columns(
transition_times_and_offsets, ["transition_times", "offsets"]
)


def _find_ambiguous_and_nonexistent(
data: DatetimeColumn, zone_name: str
) -> Tuple:
"""
Recognize ambiguous and nonexistent timestamps for the given timezone.

Returns a tuple of columns, both of "bool" dtype and of the same
    size as `data`, that respectively indicate ambiguous and
nonexistent timestamps in `data` with the value `True`.

Ambiguous and/or nonexistent timestamps are only possible if any
transitions occur in the time zone database for the given timezone.
If no transitions occur, the tuple `(None, None)` is returned.
"""
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data._time_unit}]"
)

if len(offsets) == 1: # no transitions
return False, False

transition_times, offsets, old_offsets = (
transition_times[1:]._column,
offsets[1:]._column,
offsets[:-1]._column,
)

# Assume we have two clocks at the moment of transition:
# - Clock 1 is turned forward or backwards correctly
# - Clock 2 makes no changes
clock_1 = transition_times + offsets
clock_2 = transition_times + old_offsets

# At the start of an ambiguous time period, Clock 1 (which has
# been turned back) reads less than Clock 2:
cond = clock_1 < clock_2
ambiguous_begin = clock_1.apply_boolean_mask(cond)

# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
data,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
cond = clock_1 > clock_2
nonexistent_begin = clock_2.apply_boolean_mask(cond)

# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
data,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()

return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
)
if nonexistent != "NaT":
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NA, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.data,
dtype=dtype,
mask=localized.mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times = transition_times.astype(_get_base_dtype(data.dtype))
indices = search_sorted([transition_times], [data], "right") - 1
offsets_from_utc = offsets.take(indices, nullify=True)
return data + offsets_from_utc


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times_local = (transition_times + offsets).astype(data.dtype)
indices = search_sorted([transition_times_local], [data], "right") - 1
offsets_to_utc = offsets.take(indices, nullify=True)
return data - offsets_to_utc
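To make the "two clocks" reasoning in `_find_ambiguous_and_nonexistent` and the `search_sorted(..., "right") - 1` offset lookup in `utc_to_local`/`local_to_utc` concrete, here is a small CPU-only sketch using pandas and NumPy. It is illustrative only, not the cuDF code path; the 2023 US DST transition times are hard-coded for brevity.

```python
import numpy as np
import pandas as pd

# pandas can show which wall-clock times are problematic for a zone:
# 02:30 on 2023-03-12 does not exist (clocks jump from 02:00 to 03:00), and
# 01:30 on 2023-11-05 is ambiguous (it occurs twice when clocks fall back).
for ts, kwargs in [
    ("2023-03-12 02:30", {"nonexistent": "raise"}),
    ("2023-11-05 01:30", {"ambiguous": "raise"}),
]:
    try:
        pd.Timestamp(ts).tz_localize("America/New_York", **kwargs)
    except Exception as exc:
        print(f"{type(exc).__name__}: {exc}")

# The UTC-to-local conversion mirrors the searchsorted(..., "right") - 1
# pattern above: find the last transition at or before each timestamp and
# apply the offset that took effect there.
transitions_utc = np.array(
    ["2023-03-12T07:00", "2023-11-05T06:00"], dtype="datetime64[s]"
)
offsets = np.array([-4 * 3600, -5 * 3600], dtype="timedelta64[s]")  # EDT, EST

utc_times = np.array(
    ["2023-06-01T12:00", "2023-12-01T12:00"], dtype="datetime64[s]"
)
idx = np.searchsorted(transitions_utc, utc_times, side="right") - 1
print(utc_times + offsets[idx])  # 08:00 EDT and 07:00 EST wall-clock times
```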
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

"""
isort: skip_file
@@ -23,6 +23,7 @@
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401
bdice (Contributor), Apr 26, 2023:
FYI: you can fix these F401 "unused import" errors by defining __all__. If it's in __all__, it's "part of the API" of this module and thus flake8 sees it as used.

shwina (Contributor, author):
Isn't this more explicit though? I always thought the only "official" use for __all__ was to define what happens when you do from module import *

Contributor:
I think that defining your module’s public API (rather than not defining it) is the most explicit approach. I’m pro-__all__ in Python libraries.

shwina (Contributor, author):
> think that defining your module’s public API (rather than not defining it) is the most explicit approach.

Hmm, but __all__ doesn't do that AFAICT. Placing a name in __all__ doesn't preclude it from appearing in e.g., tab completion options (naming it with a leading _ does).

Are there other tools than flake8 that do something with __all__?

Contributor:
I think type checkers (mypy, pyright) will utilize __all__ to see what APIs are "public"

bdice (Contributor), Apr 27, 2023:
Also, to your question about tab completion in IPython/etc., that's discussed here: ipython/ipykernel#129 (comment)

The community seems split on the issue, but some folks seem to think limiting to __all__ by default would be good. Either way -- it seems that __all__ is viewed as a source of truth for public APIs, and the real question is whether autocompletion should only show public APIs.

It's configurable, with the option IPCompleter.limit_to__all__ (config options docs).

> The reason we do this is that we often want to use IPython to poke about and debug modules, so we like to see what's really there, not just what's whitelisted as public API. We should, however, do a better job of prioritising completions, so that things in __all__ come before things that aren't in it.

shwina (Contributor, author):
Thanks, both! I'm on board with defining __all__

shwina (Contributor, author):
I wonder if we can get a linter to enforce that __all__ is defined (even if empty) always.

from cudf.core.column.lists import ListColumn # noqa: F401
from cudf.core.column.numerical import NumericalColumn # noqa: F401
from cudf.core.column.string import StringColumn # noqa: F401
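A small sketch of the approach discussed in the thread above (a hypothetical package layout, not this cuDF module): listing re-exports in `__all__` declares the public API and lets flake8 treat the imports as used, avoiding per-line `# noqa: F401` comments.

```python
# mypackage/columns/__init__.py (hypothetical layout, for illustration only)
from .datetime import DatetimeColumn, DatetimeTZColumn
from .numerical import NumericalColumn

# With __all__ defined, flake8 treats the imports above as used (no F401),
# and tools such as type checkers and `from mypackage.columns import *`
# see exactly this list as the module's public API.
__all__ = [
    "DatetimeColumn",
    "DatetimeTZColumn",
    "NumericalColumn",
]
```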
19 changes: 14 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -1554,6 +1554,17 @@ def build_column(
offset=offset,
null_count=null_count,
)
elif is_datetime64tz_dtype(dtype):
if data is None:
raise TypeError("Must specify data buffer")
Contributor:
ValueError seems more natural here unless you're matching something in pandas.

Suggested change:
-    raise TypeError("Must specify data buffer")
+    raise ValueError("Must specify data buffer")

shwina (Contributor, author):
This is an issue in other places in this method and I just wanted to be consistent. Agree though that ValueError is the more appropriate error type. Let's fix in a follow-up.

return cudf.core.column.datetime.DatetimeTZColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
elif dtype.type is np.timedelta64:
if data is None:
raise TypeError("Must specify data buffer")
@@ -2093,9 +2104,7 @@ def as_column(
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask

data = cudf.core.column.datetime.DatetimeColumn(
data=buffer, mask=mask, dtype=arbitrary.dtype
)
data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":

time_unit = get_time_unit(arbitrary)
@@ -2243,8 +2252,8 @@ def as_column(
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"cuDF does not yet support "
"timezone-aware datetimes"
"Use `tz_localize()` to construct "
"timezone aware data."
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
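As a footnote to the TypeError/ValueError thread above, a generic illustration of the convention the reviewers invoke (plain Python, unrelated to the cuDF code):

```python
def scale(values, factor):
    # TypeError: the argument is of the wrong type altogether.
    if not isinstance(values, list):
        raise TypeError(f"expected a list, got {type(values).__name__}")
    # ValueError: the type is acceptable, but the value is unusable.
    if factor is None:
        raise ValueError("a scaling factor must be provided")
    return [v * factor for v in values]
```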