Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for DatetimeTZDtype and tz_localize #13163

Merged
merged 33 commits into from
Apr 27, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
c2b1968
Raw bindings
shwina Feb 21, 2023
fb293c0
move declarations to /include
vuule Feb 28, 2023
3457063
Renames
shwina Mar 7, 2023
6210054
Remove extra files
shwina Mar 15, 2023
cf67f8b
Final touches
shwina Apr 5, 2023
82ec81e
Revert "move declarations to /include"
shwina Apr 17, 2023
714116d
Add datetimetzdtype
shwina Apr 17, 2023
3a09e3d
tmp
shwina Apr 17, 2023
8ed31eb
First pass at localize
shwina Apr 17, 2023
e72b3fc
Fix up and add tests for localize
shwina Apr 17, 2023
d05e9e0
Add tests for nonexistent
shwina Apr 17, 2023
5b2c990
Improvements to localize
shwina Apr 18, 2023
df3f7bc
Docs
shwina Apr 18, 2023
a4f6430
Styles
shwina Apr 18, 2023
8f4b215
More style
shwina Apr 18, 2023
b6ec602
Use keywords
shwina Apr 18, 2023
54dc4d5
Run localize tests for all time zones
shwina Apr 19, 2023
fad1c6a
Delocalize tests and index tests
shwina Apr 19, 2023
4e89a5c
Use assume_timezone, not cast
shwina Apr 19, 2023
a473e51
Apply suggestions from code review
shwina Apr 19, 2023
1d18fcd
Doc
shwina Apr 19, 2023
99cc4b0
Apply suggestions from code review
shwina Apr 19, 2023
6982474
Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-…
shwina Apr 19, 2023
b7563b8
Merge branch 'branch-23.06' into add-tz-localize
shwina Apr 25, 2023
a1141d8
Merge branch 'branch-23.06' of github.com:rapidsai/cudf into add-tz-l…
shwina Apr 26, 2023
fc5f497
Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-…
shwina Apr 26, 2023
22040ce
Not all platforms have `factory`.
shwina Apr 26, 2023
9e0f922
Use Pandas directly to test if we can work with a TZ
shwina Apr 26, 2023
cfe5005
Merge branch 'branch-23.06' into add-tz-localize
shwina Apr 26, 2023
dcf49f8
Apply suggestions from code review
shwina Apr 26, 2023
8cd32f8
Apply review suggestions
shwina Apr 26, 2023
94aae6e
Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-…
shwina Apr 26, 2023
ffbcf42
Type metadata handling
shwina Apr 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/cudf/source/api_docs/index_objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ Time-specific operations
DatetimeIndex.round
DatetimeIndex.ceil
DatetimeIndex.floor
DatetimeIndex.tz_localize

shwina marked this conversation as resolved.
Show resolved Hide resolved

Conversion
~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ Datetime methods
round
floor
ceil
tz_localize


Timedelta properties
Expand Down
13 changes: 10 additions & 3 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ import rmm

import cudf
import cudf._lib as libcudf
from cudf.api.types import is_categorical_dtype
from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
from cudf.core.buffer import (
Buffer,
CopyOnWriteBuffer,
SpillableBuffer,
acquire_spill_lock,
as_buffer,
)

from cudf.utils.dtypes import _get_base_dtype
from cpython.buffer cimport PyObject_CheckBuffer
from libc.stdint cimport uintptr_t
from libcpp.memory cimport make_unique, unique_ptr
Expand Down Expand Up @@ -313,9 +313,13 @@ cdef class Column:
cdef mutable_column_view mutable_view(self) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = col.dtype
data_dtype = self.dtype
shwina marked this conversation as resolved.
Show resolved Hide resolved

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
Expand Down Expand Up @@ -373,6 +377,9 @@ cdef class Column:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = self.dtype
shwina marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
155 changes: 148 additions & 7 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,19 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast

from cudf._lib.timezone import build_timezone_transition_table
import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype


@lru_cache(maxsize=20)
Expand All @@ -21,15 +31,16 @@ def get_tz_data(zone_name):

Returns
-------
DataFrame with two columns containing the transition times ("dt")
and corresponding UTC offsets ("offset").
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
"""
try:
# like zoneinfo, we first look in TZPATH
return _find_and_read_tzfile_tzpath(zone_name)
tz_table = _find_and_read_tzfile_tzpath(zone_name)
except zoneinfo.ZoneInfoNotFoundError:
# if that fails, we fall back to using `tzdata`
return _find_and_read_tzfile_tzdata(zone_name)
tz_table = _find_and_read_tzfile_tzdata(zone_name)
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
Expand Down Expand Up @@ -67,5 +78,135 @@ def _find_and_read_tzfile_tzdata(zone_name):


def _read_tzfile_as_frame(tzdir, zone_name):
dt, offsets = build_timezone_transition_table(tzdir, zone_name)
return DataFrame._from_columns([dt, offsets], ["dt", "offsets"])
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = as_column([min_date]), as_column(
[np.timedelta64(0, "s")]
)

return DataFrame._from_columns(
transition_times_and_offsets, ["transition_times", "offsets"]
)


def _find_ambiguous_and_nonexistent(
data: DatetimeColumn, zone_name: str
) -> Tuple:
"""
Recognize ambiguous and nonexistent timestamps for the given timezone.
bdice marked this conversation as resolved.
Show resolved Hide resolved
"""
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data._time_unit}]"
)

if len(offsets) == 1: # no transitions
return False, False
bdice marked this conversation as resolved.
Show resolved Hide resolved

transition_times, offsets, old_offsets = (
transition_times[1:]._column,
offsets[1:]._column,
offsets[:-1]._column,
)

# Assume we have two clocks at the moment of transition:
# - Clock 1 is turned forward or backwards correctly
# - Clock 2 makes no changes
clock_1 = transition_times + offsets
clock_2 = transition_times + old_offsets

# At the start of an ambiguous time period, Clock 1 (which has
# been turned back) reads less than Clock 2:
cond = clock_1 < clock_2
ambiguous_begin = clock_1.apply_boolean_mask(cond)

# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
data,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
cond = clock_1 > clock_2
nonexistent_begin = clock_2.apply_boolean_mask(cond)

# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
data,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()

return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
) -> DatetimeTZColumn:
if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
)
if nonexistent != "NaT":
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NA, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.data,
dtype=dtype,
mask=localized.mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times = transition_times.astype(_get_base_dtype(data.dtype))
indices = search_sorted([transition_times], [data], "right") - 1
offsets_from_utc = offsets.take(indices, nullify=True)
return data + offsets_from_utc


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times_local = (transition_times + offsets).astype(data.dtype)
indices = search_sorted([transition_times_local], [data], "right") - 1
offsets_to_utc = offsets.take(indices, nullify=True)
return data - offsets_to_utc
19 changes: 14 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1554,6 +1554,17 @@ def build_column(
offset=offset,
null_count=null_count,
)
elif is_datetime64tz_dtype(dtype):
if data is None:
raise TypeError("Must specify data buffer")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ValueError seems more natural here unless you're matching something in pandas.

Suggested change
raise TypeError("Must specify data buffer")
raise ValueError("Must specify data buffer")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an issue in other places in this method and I just wanted to be consistent. Agree though that ValueError is the more appropriate error type. Let's fix in a follow-up.

return cudf.core.column.datetime.DatetimeTZColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
elif dtype.type is np.timedelta64:
if data is None:
raise TypeError("Must specify data buffer")
Expand Down Expand Up @@ -2093,9 +2104,7 @@ def as_column(
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask

data = cudf.core.column.datetime.DatetimeColumn(
data=buffer, mask=mask, dtype=arbitrary.dtype
)
data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":

time_unit = get_time_unit(arbitrary)
Expand Down Expand Up @@ -2243,8 +2252,8 @@ def as_column(
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"cuDF does not yet support "
"timezone-aware datetimes"
"Use `tz_localize()` to construct "
"timezone aware data."
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
Expand Down
75 changes: 74 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa

import cudf
from cudf import _lib as libcudf
Expand All @@ -20,10 +21,16 @@
DtypeObj,
ScalarLike,
)
from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
from cudf.api.types import (
is_datetime64_dtype,
is_datetime64tz_dtype,
is_scalar,
is_timedelta64_dtype,
)
from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
from cudf.utils.utils import _fillna_natwise

_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
Expand Down Expand Up @@ -517,6 +524,72 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
else:
return False

def _with_type_metadata(self, dtype):
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
if is_datetime64tz_dtype(dtype):
return DatetimeTZColumn(
data=self.base_data,
dtype=dtype,
mask=self.base_mask,
size=self.size,
offset=self.offset,
null_count=self.null_count,
)
return self


class DatetimeTZColumn(DatetimeColumn):
def __init__(
self,
data: Buffer,
dtype: pd.DatetimeTZDtype,
mask: Buffer = None,
size: int = None,
offset: int = 0,
null_count: int = None,
):
super().__init__(
data=data,
dtype=_get_base_dtype(dtype),
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
self._dtype = dtype

def to_pandas(
self, index: pd.Index = None, nullable: bool = False, **kwargs
) -> "cudf.Series":
return self._local_time.to_pandas().dt.tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)

def to_arrow(self):
return self._local_time.to_arrow().cast(
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
pa.timestamp(self.dtype.unit, str(self.dtype.tz))
)

@property
def values(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this property error implementation be inherited from DateTimeColumn? Or if you think it's possible that CuPy might support time-zone-naive data (the parent class but not this zone-aware class), should we mention "time zone" in the error message?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope -- you're right, we should just inherit this

"""
Return a CuPy representation of the DateTimeColumn.
"""
raise NotImplementedError(
"DateTime Arrays is not yet implemented in cudf"
shwina marked this conversation as resolved.
Show resolved Hide resolved
)

@property
def _local_time(self):
"""Return the local time as naive timestamps."""
from cudf.core._internals.timezones import utc_to_local

return utc_to_local(self, str(self.dtype.tz))

def as_string_column(
self, dtype: Dtype, format=None, **kwargs
) -> "cudf.core.column.StringColumn":
return self._local_time.as_string_column(dtype, format, **kwargs)


def infer_format(element: str, **kwargs) -> str:
"""
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def dtype(arbitrary):
return np.dtype("object")
elif isinstance(pd_dtype, pd.IntervalDtype):
return cudf.IntervalDtype.from_pandas(pd_dtype)
elif isinstance(pd_dtype, pd.DatetimeTZDtype):
return pd_dtype
else:
raise TypeError(
f"Cannot interpret {arbitrary} as a valid cuDF dtype"
Expand Down
Loading