Add support for DatetimeTZDtype and tz_localize #13163

Merged Apr 27, 2023 · 33 commits · changes shown from 24 of the 33 commits

Commits
c2b1968  Raw bindings (shwina, Feb 21, 2023)
fb293c0  move declarations to /include (vuule, Feb 28, 2023)
3457063  Renames (shwina, Mar 7, 2023)
6210054  Remove extra files (shwina, Mar 15, 2023)
cf67f8b  Final touches (shwina, Apr 5, 2023)
82ec81e  Revert "move declarations to /include" (shwina, Apr 17, 2023)
714116d  Add datetimetzdtype (shwina, Apr 17, 2023)
3a09e3d  tmp (shwina, Apr 17, 2023)
8ed31eb  First pass at localize (shwina, Apr 17, 2023)
e72b3fc  Fix up and add tests for localize (shwina, Apr 17, 2023)
d05e9e0  Add tests for nonexistent (shwina, Apr 17, 2023)
5b2c990  Improvements to localize (shwina, Apr 18, 2023)
df3f7bc  Docs (shwina, Apr 18, 2023)
a4f6430  Styles (shwina, Apr 18, 2023)
8f4b215  More style (shwina, Apr 18, 2023)
b6ec602  Use keywords (shwina, Apr 18, 2023)
54dc4d5  Run localize tests for all time zones (shwina, Apr 19, 2023)
fad1c6a  Delocalize tests and index tests (shwina, Apr 19, 2023)
4e89a5c  Use assume_timezone, not cast (shwina, Apr 19, 2023)
a473e51  Apply suggestions from code review (shwina, Apr 19, 2023)
1d18fcd  Doc (shwina, Apr 19, 2023)
99cc4b0  Apply suggestions from code review (shwina, Apr 19, 2023)
6982474  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 19, 2023)
b7563b8  Merge branch 'branch-23.06' into add-tz-localize (shwina, Apr 25, 2023)
a1141d8  Merge branch 'branch-23.06' of github.com:rapidsai/cudf into add-tz-l… (shwina, Apr 26, 2023)
fc5f497  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 26, 2023)
22040ce  Not all platforms have `factory`. (shwina, Apr 26, 2023)
9e0f922  Use Pandas directly to test if we can work with a TZ (shwina, Apr 26, 2023)
cfe5005  Merge branch 'branch-23.06' into add-tz-localize (shwina, Apr 26, 2023)
dcf49f8  Apply suggestions from code review (shwina, Apr 26, 2023)
8cd32f8  Apply review suggestions (shwina, Apr 26, 2023)
94aae6e  Merge branch 'add-tz-localize' of github.com:shwina/cudf into add-tz-… (shwina, Apr 26, 2023)
ffbcf42  Type metadata handling (shwina, Apr 27, 2023)
2 changes: 2 additions & 0 deletions docs/cudf/source/api_docs/index_objects.rst
@@ -283,6 +283,8 @@ Time-specific operations
DatetimeIndex.round
DatetimeIndex.ceil
DatetimeIndex.floor
DatetimeIndex.tz_localize


Conversion
~~~~~~~~~~
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
@@ -295,6 +295,7 @@ Datetime methods
round
floor
ceil
tz_localize


Timedelta properties
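To ground the doc entries above, here is a minimal usage sketch of the new Series-level API. It is hedged: it assumes the `.dt.tz_localize` accessor spelling implied by these pandas-style docs and the `NaT` handling described later in this PR, and the exact output depends on the cuDF build and the tz data available.

```python
import pandas as pd

import cudf

# Naive timestamps straddling the US spring-forward transition on 2023-03-12;
# the 02:30 wall time does not exist on that date in America/New_York.
naive = pd.to_datetime(
    ["2023-03-12 01:30", "2023-03-12 02:30", "2023-03-12 03:30"]
)
s = cudf.Series(naive)

# The PR implements only ambiguous="NaT" and nonexistent="NaT", so the
# nonexistent 02:30 entry is expected to come back as NaT.
print(s.dt.tz_localize("America/New_York"))
```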
15 changes: 11 additions & 4 deletions python/cudf/cudf/_lib/column.pyx
@@ -8,15 +8,15 @@ import rmm

import cudf
import cudf._lib as libcudf
from cudf.api.types import is_categorical_dtype
from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
from cudf.core.buffer import (
Buffer,
CopyOnWriteBuffer,
SpillableBuffer,
acquire_spill_lock,
as_buffer,
)

from cudf.utils.dtypes import _get_base_dtype
from cpython.buffer cimport PyObject_CheckBuffer
from libc.stdint cimport uintptr_t
from libcpp.memory cimport make_unique, unique_ptr
@@ -313,9 +313,13 @@ cdef class Column:
cdef mutable_column_view mutable_view(self) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = col.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
@@ -373,9 +377,12 @@ cdef class Column:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
elif is_datetime64tz_dtype(self.dtype):
col = self
data_dtype = _get_base_dtype(col.dtype)
else:
col = self
data_dtype = self.dtype
data_dtype = col.dtype

cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
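For context on the `_get_base_dtype` calls added above: the intent is to hand libcudf the tz-naive timestamp type that backs a DatetimeTZDtype column. A rough stand-in illustration follows (hypothetical helper name, not the cuDF implementation):

```python
import numpy as np
import pandas as pd


def base_dtype_of(dtype: pd.DatetimeTZDtype) -> np.dtype:
    """Return the tz-naive datetime64 dtype with the same resolution."""
    return np.dtype(f"datetime64[{dtype.unit}]")


# datetime64[ns]: the type the column's data buffer is interpreted as on the GPU.
print(base_dtype_of(pd.DatetimeTZDtype("ns", "America/New_York")))
```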
163 changes: 156 additions & 7 deletions python/cudf/cudf/core/_internals/timezones.py
@@ -3,9 +3,19 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast

from cudf._lib.timezone import build_timezone_transition_table
import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype


@lru_cache(maxsize=20)
@@ -21,15 +31,16 @@ def get_tz_data(zone_name):

Returns
-------
DataFrame with two columns containing the transition times ("dt")
and corresponding UTC offsets ("offset").
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
"""
try:
# like zoneinfo, we first look in TZPATH
return _find_and_read_tzfile_tzpath(zone_name)
tz_table = _find_and_read_tzfile_tzpath(zone_name)
except zoneinfo.ZoneInfoNotFoundError:
# if that fails, we fall back to using `tzdata`
return _find_and_read_tzfile_tzdata(zone_name)
tz_table = _find_and_read_tzfile_tzdata(zone_name)
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
@@ -67,5 +78,143 @@ def _find_and_read_tzfile_tzdata(zone_name):


def _read_tzfile_as_frame(tzdir, zone_name):
dt, offsets = build_timezone_transition_table(tzdir, zone_name)
return DataFrame._from_columns([dt, offsets], ["dt", "offsets"])
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = as_column([min_date]), as_column(
[np.timedelta64(0, "s")]
)

return DataFrame._from_columns(
transition_times_and_offsets, ["transition_times", "offsets"]
)


def _find_ambiguous_and_nonexistent(
data: DatetimeColumn, zone_name: str
) -> Tuple:
"""
Recognize ambiguous and nonexistent timestamps for the given timezone.

Returns a tuple of columns, both of "bool" dtype and of the same
    size as `data`, that respectively indicate ambiguous and
nonexistent timestamps in `data` with the value `True`.

Ambiguous and/or nonexistent timestamps are only possible if any
transitions occur in the time zone database for the given timezone.
If no transitions occur, the tuple `(None, None)` is returned.
"""
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data._time_unit}]"
)

if len(offsets) == 1: # no transitions
return False, False

transition_times, offsets, old_offsets = (
transition_times[1:]._column,
offsets[1:]._column,
offsets[:-1]._column,
)

# Assume we have two clocks at the moment of transition:
# - Clock 1 is turned forward or backwards correctly
# - Clock 2 makes no changes
clock_1 = transition_times + offsets
clock_2 = transition_times + old_offsets

# At the start of an ambiguous time period, Clock 1 (which has
# been turned back) reads less than Clock 2:
cond = clock_1 < clock_2
ambiguous_begin = clock_1.apply_boolean_mask(cond)

# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
data,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
cond = clock_1 > clock_2
nonexistent_begin = clock_2.apply_boolean_mask(cond)

# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
data,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()

return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
)
if nonexistent != "NaT":
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NA, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.data,
dtype=dtype,
mask=localized.mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times = transition_times.astype(_get_base_dtype(data.dtype))
indices = search_sorted([transition_times], [data], "right") - 1
offsets_from_utc = offsets.take(indices, nullify=True)
return data + offsets_from_utc


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times_local = (transition_times + offsets).astype(data.dtype)
indices = search_sorted([transition_times_local], [data], "right") - 1
offsets_to_utc = offsets.take(indices, nullify=True)
return data - offsets_to_utc
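To make the "two clocks" reasoning in `_find_ambiguous_and_nonexistent` and the `search_sorted(..., "right") - 1` offset lookup in `utc_to_local`/`local_to_utc` concrete, here is a small CPU-only sketch using pandas and NumPy. It is illustrative only, not the cuDF code path; the 2023 US DST transition times are hard-coded for brevity.

```python
import numpy as np
import pandas as pd

# pandas can show which wall-clock times are problematic for a zone:
# 02:30 on 2023-03-12 does not exist (clocks jump from 02:00 to 03:00), and
# 01:30 on 2023-11-05 is ambiguous (it occurs twice when clocks fall back).
for ts, kwargs in [
    ("2023-03-12 02:30", {"nonexistent": "raise"}),
    ("2023-11-05 01:30", {"ambiguous": "raise"}),
]:
    try:
        pd.Timestamp(ts).tz_localize("America/New_York", **kwargs)
    except Exception as exc:
        print(f"{type(exc).__name__}: {exc}")

# The UTC-to-local conversion mirrors the searchsorted(..., "right") - 1
# pattern above: find the last transition at or before each timestamp and
# apply the offset that took effect there.
transitions_utc = np.array(
    ["2023-03-12T07:00", "2023-11-05T06:00"], dtype="datetime64[s]"
)
offsets = np.array([-4 * 3600, -5 * 3600], dtype="timedelta64[s]")  # EDT, EST

utc_times = np.array(
    ["2023-06-01T12:00", "2023-12-01T12:00"], dtype="datetime64[s]"
)
idx = np.searchsorted(transitions_utc, utc_times, side="right") - 1
print(utc_times + offsets[idx])  # 08:00 EDT and 07:00 EST wall-clock times
```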
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

"""
isort: skip_file
@@ -23,6 +23,7 @@
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401
bdice (Contributor), Apr 26, 2023:
FYI: you can fix these F401 "unused import" errors by defining __all__. If it's in __all__, it's "part of the API" of this module and thus flake8 sees it as used.

shwina (Contributor, author):
Isn't this more explicit though? I always thought the only "official" use for __all__ was to define what happens when you do from module import *

Contributor:
I think that defining your module’s public API (rather than not defining it) is the most explicit approach. I’m pro-__all__ in Python libraries.

shwina (Contributor, author):
> think that defining your module’s public API (rather than not defining it) is the most explicit approach.

Hmm, but __all__ doesn't do that AFAICT. Placing a name in __all__ doesn't preclude it from appearing in e.g., tab completion options (naming it with a leading _ does).

Are there other tools than flake8 that do something with __all__?

Contributor:
I think type checkers (mypy, pyright) will utilize __all__ to see what APIs are "public"

bdice (Contributor), Apr 27, 2023:
Also, to your question about tab completion in IPython/etc., that's discussed here: ipython/ipykernel#129 (comment)

The community seems split on the issue, but some folks seem to think limiting to __all__ by default would be good. Either way -- it seems that __all__ is viewed as a source of truth for public APIs, and the real question is whether autocompletion should only show public APIs.

It's configurable, with the option IPCompleter.limit_to__all__ (config options docs).

> The reason we do this is that we often want to use IPython to poke about and debug modules, so we like to see what's really there, not just what's whitelisted as public API. We should, however, do a better job of prioritising completions, so that things in __all__ come before things that aren't in it.

shwina (Contributor, author):
Thanks, both! I'm on board with defining __all__

shwina (Contributor, author):
I wonder if we can get a linter to enforce that __all__ is defined (even if empty) always.

from cudf.core.column.lists import ListColumn # noqa: F401
from cudf.core.column.numerical import NumericalColumn # noqa: F401
from cudf.core.column.string import StringColumn # noqa: F401
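A small sketch of the approach discussed in the thread above (a hypothetical package layout, not this cuDF module): listing re-exports in `__all__` declares the public API and lets flake8 treat the imports as used, avoiding per-line `# noqa: F401` comments.

```python
# mypackage/columns/__init__.py (hypothetical layout, for illustration only)
from .datetime import DatetimeColumn, DatetimeTZColumn
from .numerical import NumericalColumn

# With __all__ defined, flake8 treats the imports above as used (no F401),
# and tools such as type checkers and `from mypackage.columns import *`
# see exactly this list as the module's public API.
__all__ = [
    "DatetimeColumn",
    "DatetimeTZColumn",
    "NumericalColumn",
]
```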
19 changes: 14 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -1554,6 +1554,17 @@ def build_column(
offset=offset,
null_count=null_count,
)
elif is_datetime64tz_dtype(dtype):
if data is None:
raise TypeError("Must specify data buffer")
Contributor:
ValueError seems more natural here unless you're matching something in pandas.

Suggested change:
-    raise TypeError("Must specify data buffer")
+    raise ValueError("Must specify data buffer")

shwina (Contributor, author):
This is an issue in other places in this method and I just wanted to be consistent. Agree though that ValueError is the more appropriate error type. Let's fix in a follow-up.

return cudf.core.column.datetime.DatetimeTZColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
elif dtype.type is np.timedelta64:
if data is None:
raise TypeError("Must specify data buffer")
@@ -2093,9 +2104,7 @@ def as_column(
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask

data = cudf.core.column.datetime.DatetimeColumn(
data=buffer, mask=mask, dtype=arbitrary.dtype
)
data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":

time_unit = get_time_unit(arbitrary)
@@ -2243,8 +2252,8 @@ def as_column(
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"cuDF does not yet support "
"timezone-aware datetimes"
"Use `tz_localize()` to construct "
"timezone aware data."
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
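As a footnote to the TypeError/ValueError thread above, a generic illustration of the convention the reviewers invoke (plain Python, unrelated to the cuDF code):

```python
def scale(values, factor):
    # TypeError: the argument is of the wrong type altogether.
    if not isinstance(values, list):
        raise TypeError(f"expected a list, got {type(values).__name__}")
    # ValueError: the type is acceptable, but the value is unusable.
    if factor is None:
        raise ValueError("a scaling factor must be provided")
    return [v * factor for v in values]
```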