Skip to content

Commit

Permalink
Add support for DatetimeTZDtype and tz_localize (#13163)
Browse files Browse the repository at this point in the history
TBD

Quick benchmark:

```python
psr = pd.Series(pd.date_range("1970-01-01", "1980-01-01", freq="1T"))
sr = cudf.from_pandas(psr)

In [6]: %timeit psr.dt.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT")
1.55 s ± 9.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [7]: %timeit sr.dt.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT")
9.98 ms ± 246 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #13163
  • Loading branch information
shwina authored Apr 27, 2023
1 parent f4e0f19 commit e13dfec
Show file tree
Hide file tree
Showing 15 changed files with 478 additions and 47 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/index_objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ Time-specific operations
DatetimeIndex.round
DatetimeIndex.ceil
DatetimeIndex.floor
DatetimeIndex.tz_localize

Conversion
~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ Datetime methods
round
floor
ceil
tz_localize


Timedelta properties
Expand Down
15 changes: 11 additions & 4 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ import rmm

import cudf
import cudf._lib as libcudf
from cudf.api.types import is_categorical_dtype
from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
from cudf.core.buffer import (
Buffer,
CopyOnWriteBuffer,
SpillableBuffer,
acquire_spill_lock,
as_buffer,
)

from cudf.utils.dtypes import _get_base_dtype
from cpython.buffer cimport PyObject_CheckBuffer
from libc.stdint cimport uintptr_t
from libcpp.memory cimport make_unique, unique_ptr
Expand Down Expand Up @@ -313,9 +313,13 @@ cdef class Column:
cdef mutable_column_view mutable_view(self) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = col.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
Expand Down Expand Up @@ -373,9 +377,12 @@ cdef class Column:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = self.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
Expand Down
163 changes: 156 additions & 7 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,19 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast

from cudf._lib.timezone import build_timezone_transition_table
import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype


@lru_cache(maxsize=20)
Expand All @@ -21,15 +31,16 @@ def get_tz_data(zone_name):
Returns
-------
DataFrame with two columns containing the transition times ("dt")
and corresponding UTC offsets ("offset").
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
"""
try:
# like zoneinfo, we first look in TZPATH
return _find_and_read_tzfile_tzpath(zone_name)
tz_table = _find_and_read_tzfile_tzpath(zone_name)
except zoneinfo.ZoneInfoNotFoundError:
# if that fails, we fall back to using `tzdata`
return _find_and_read_tzfile_tzdata(zone_name)
tz_table = _find_and_read_tzfile_tzdata(zone_name)
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
Expand Down Expand Up @@ -67,5 +78,143 @@ def _find_and_read_tzfile_tzdata(zone_name):


def _read_tzfile_as_frame(tzdir, zone_name):
    """
    Load the transition table of a time zone as a two-column DataFrame.

    Parameters
    ----------
    tzdir
        Forwarded unchanged to ``make_timezone_transition_table``.
    zone_name
        Forwarded unchanged to ``make_timezone_transition_table``.

    Returns
    -------
    DataFrame with the columns "transition_times" and "offsets". When
    ``make_timezone_transition_table`` yields nothing, the frame holds a
    single placeholder row with a zero offset.
    """
    transition_times_and_offsets = make_timezone_transition_table(
        tzdir, zone_name
    )

    if not transition_times_and_offsets:
        # this happens for UTC-like zones
        # Use one placeholder row: the smallest int64 value plus one,
        # viewed as a second-resolution timestamp, with a zero offset.
        min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
        transition_times_and_offsets = as_column([min_date]), as_column(
            [np.timedelta64(0, "s")]
        )

    return DataFrame._from_columns(
        transition_times_and_offsets, ["transition_times", "offsets"]
    )


def _find_ambiguous_and_nonexistent(
    data: DatetimeColumn, zone_name: str
) -> Tuple:
    """
    Flag the timestamps in `data` that are ambiguous or nonexistent
    in the time zone `zone_name`.

    Returns
    -------
    A pair ``(ambiguous, nonexistent)``. When the transition table of
    the zone has more than one row, both members are produced by
    ``label_bins(...).notnull()`` on `data` and mark the matching
    timestamps. When the table has a single row (no transitions), the
    plain pair ``(False, False)`` is returned instead.
    """
    tz_table = get_tz_data(zone_name)
    all_times = tz_table["transition_times"]
    all_offsets = tz_table["offsets"].astype(
        f"timedelta64[{data._time_unit}]"
    )

    if len(all_offsets) == 1:  # no transitions
        return False, False

    # Pair every table entry after the first with its own offset and
    # with the offset of the entry that precedes it.
    switch_times = all_times[1:]._column
    offsets_new = all_offsets[1:]._column
    offsets_old = all_offsets[:-1]._column

    # Two imaginary clocks are read at each transition instant: one
    # that has already been adjusted to the new offset, and one that
    # still runs on the old offset.
    adjusted_clock = switch_times + offsets_new
    unadjusted_clock = switch_times + offsets_old

    def _falls_within(begin, end):
        # True where a timestamp of `data` lies in some [begin, end).
        return label_bins(
            data,
            left_edges=begin,
            left_inclusive=True,
            right_edges=end,
            right_inclusive=False,
        ).notnull()

    # Clock turned back: the adjusted clock reads less than the
    # unadjusted one. The readings in between occur twice, starting at
    # the adjusted reading and ending at the unadjusted reading.
    turned_back = adjusted_clock < unadjusted_clock
    ambiguous = _falls_within(
        adjusted_clock.apply_boolean_mask(turned_back),
        unadjusted_clock.apply_boolean_mask(turned_back),
    )

    # Clock turned forward: the unadjusted clock reads less than the
    # adjusted one. The readings in between never occur, starting at
    # the unadjusted reading and ending at the adjusted reading.
    turned_forward = adjusted_clock > unadjusted_clock
    nonexistent = _falls_within(
        unadjusted_clock.apply_boolean_mask(turned_forward),
        adjusted_clock.apply_boolean_mask(turned_forward),
    )

    return ambiguous, nonexistent


def localize(
    data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
    """
    Attach the time zone `zone_name` to the naive timestamps in `data`.

    Timestamps that are null, ambiguous or nonexistent in the zone are
    replaced by nulls before the remaining values are converted with
    ``local_to_utc``.

    Raises
    ------
    NotImplementedError
        If `ambiguous` or `nonexistent` is anything other than "NaT".
    ValueError
        If `data` is already a ``DatetimeTZColumn``.
    """
    if ambiguous != "NaT":
        raise NotImplementedError(
            "Only ambiguous='NaT' is currently supported"
        )
    if nonexistent != "NaT":
        raise NotImplementedError(
            "Only nonexistent='NaT' is currently supported"
        )
    if isinstance(data, DatetimeTZColumn):
        raise ValueError(
            "Already localized. "
            "Use `tz_convert` to convert between time zones."
        )

    tz_dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
    is_ambiguous, is_nonexistent = _find_ambiguous_and_nonexistent(
        data, zone_name
    )

    # Null out every entry that cannot be mapped to a single instant.
    to_nullify = data.isnull() | (is_ambiguous | is_nonexistent)
    null_value = cudf.Scalar(cudf.NA, dtype=data.dtype)
    naive = cast(
        DatetimeColumn, data._scatter_by_column(to_nullify, null_value)
    )

    as_utc = local_to_utc(naive, zone_name)

    # The mask is taken from the nullified local column, not from the
    # converted one.
    tz_aware = build_column(
        data=as_utc.data,
        dtype=tz_dtype,
        mask=naive.mask,
        size=as_utc.size,
        offset=as_utc.offset,
    )
    return cast(DatetimeTZColumn, tz_aware)


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
    """
    Add the UTC offset of `zone_name` to each timestamp in `data`.

    The offset for a timestamp is looked up in the zone's transition
    table, using the transition times cast to the base dtype of `data`.
    """
    transition_times, offsets = get_tz_data(zone_name)._columns
    boundaries = transition_times.astype(_get_base_dtype(data.dtype))
    # "right" insertion point minus one: the row of the last transition
    # that is not later than the timestamp.
    rows = search_sorted([boundaries], [data], "right") - 1
    # NOTE(review): nullify=True presumably yields nulls for rows that
    # fall outside the table -- confirm against `take`.
    return data + offsets.take(rows, nullify=True)


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
    """
    Subtract the UTC offset of `zone_name` from each timestamp in `data`.

    The offset for a timestamp is looked up in the zone's transition
    table, after shifting each transition time by its own offset and
    casting the result to the dtype of `data`.
    """
    transition_times, offsets = get_tz_data(zone_name)._columns
    local_boundaries = (transition_times + offsets).astype(data.dtype)
    # "right" insertion point minus one: the row of the last shifted
    # transition that is not later than the timestamp.
    rows = search_sorted([local_boundaries], [data], "right") - 1
    # NOTE(review): nullify=True presumably yields nulls for rows that
    # fall outside the table -- confirm against `take`.
    return data - offsets.take(rows, nullify=True)
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

"""
isort: skip_file
Expand All @@ -23,6 +23,7 @@
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401
from cudf.core.column.lists import ListColumn # noqa: F401
from cudf.core.column.numerical import NumericalColumn # noqa: F401
from cudf.core.column.string import StringColumn # noqa: F401
Expand Down
19 changes: 14 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1554,6 +1554,17 @@ def build_column(
offset=offset,
null_count=null_count,
)
elif is_datetime64tz_dtype(dtype):
if data is None:
raise TypeError("Must specify data buffer")
return cudf.core.column.datetime.DatetimeTZColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
elif dtype.type is np.timedelta64:
if data is None:
raise TypeError("Must specify data buffer")
Expand Down Expand Up @@ -2093,9 +2104,7 @@ def as_column(
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask

data = cudf.core.column.datetime.DatetimeColumn(
data=buffer, mask=mask, dtype=arbitrary.dtype
)
data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":

time_unit = get_time_unit(arbitrary)
Expand Down Expand Up @@ -2243,8 +2252,8 @@ def as_column(
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"cuDF does not yet support "
"timezone-aware datetimes"
"Use `tz_localize()` to construct "
"timezone aware data."
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
Expand Down
66 changes: 65 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa

import cudf
from cudf import _lib as libcudf
Expand All @@ -20,10 +21,16 @@
DtypeObj,
ScalarLike,
)
from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
from cudf.api.types import (
is_datetime64_dtype,
is_datetime64tz_dtype,
is_scalar,
is_timedelta64_dtype,
)
from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
from cudf.utils.utils import _fillna_natwise

_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
Expand Down Expand Up @@ -517,6 +524,63 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
else:
return False

def _with_type_metadata(self, dtype):
    """
    Return this column re-typed as `dtype` when `dtype` is tz-aware.

    For a timezone-aware `dtype`, a ``DatetimeTZColumn`` is built from
    this column's base data, base mask, size, offset and null count.
    For any other `dtype`, the column itself is returned unchanged.
    """
    if not is_datetime64tz_dtype(dtype):
        return self

    return DatetimeTZColumn(
        data=self.base_data,
        dtype=dtype,
        mask=self.base_mask,
        size=self.size,
        offset=self.offset,
        null_count=self.null_count,
    )


class DatetimeTZColumn(DatetimeColumn):
    """
    Datetime column that carries a timezone-aware dtype.

    The parent ``DatetimeColumn`` is initialised with the base dtype
    obtained from ``_get_base_dtype(dtype)``; the timezone-aware dtype
    is then stored on the instance. Conversions to pandas, Arrow and
    strings go through ``_local_time``.
    """

    def __init__(
        self,
        data: Buffer,
        dtype: pd.DatetimeTZDtype,
        mask: Buffer = None,
        size: int = None,
        offset: int = 0,
        null_count: int = None,
    ):
        super().__init__(
            data=data,
            dtype=_get_base_dtype(dtype),
            mask=mask,
            size=size,
            offset=offset,
            null_count=null_count,
        )
        # The parent was set up with the base dtype; record the
        # timezone-aware dtype afterwards.
        self._dtype = dtype

    def to_pandas(
        self, index: pd.Index = None, nullable: bool = False, **kwargs
    ) -> pd.Series:
        """
        Convert to a timezone-aware pandas Series.

        Parameters
        ----------
        index
            If given, it is assigned as the index of the result.
        nullable
            Accepted for signature compatibility; not used here.
        **kwargs
            Accepted for signature compatibility; not used here.

        Returns
        -------
        The local-time values converted to pandas and localized to
        this column's time zone with ``ambiguous="NaT"`` and
        ``nonexistent="NaT"``.
        """
        series = self._local_time.to_pandas().dt.tz_localize(
            self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
        )
        if index is not None:
            # Apply the caller's index instead of silently dropping it.
            series.index = index
        return series

    def to_arrow(self):
        """
        Convert to Arrow by attaching this column's time zone to the
        local-time values with ``pa.compute.assume_timezone``.
        """
        return pa.compute.assume_timezone(
            self._local_time.to_arrow(), str(self.dtype.tz)
        )

    @property
    def _local_time(self):
        """Return the local time as naive timestamps."""
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import with
        # cudf.core._internals.timezones -- confirm.
        from cudf.core._internals.timezones import utc_to_local

        return utc_to_local(self, str(self.dtype.tz))

    def as_string_column(
        self, dtype: Dtype, format=None, **kwargs
    ) -> "cudf.core.column.StringColumn":
        """Format the local-time values as strings."""
        return self._local_time.as_string_column(dtype, format, **kwargs)


def infer_format(element: str, **kwargs) -> str:
"""
Expand Down
Loading

0 comments on commit e13dfec

Please sign in to comment.