Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move timezone conversion logic to DatetimeColumn #15545

Merged
merged 7 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 24 additions & 177 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,18 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast
from typing import Literal, Tuple

import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype
from cudf.core.column.column import as_column
from cudf.core.column.datetime import DatetimeColumn
from cudf.core.column.timedelta import TimeDeltaColumn


@lru_cache(maxsize=20)
def get_tz_data(zone_name):
def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Please update the docstring description of the return value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the catch. Updated

"""
Return timezone data (transition times and UTC offsets) for the
given IANA time zone.
Expand All @@ -31,8 +26,8 @@ def get_tz_data(zone_name):

Returns
-------
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
Tuple with two columns containing the transition times
and corresponding UTC offsets.
"""
try:
# like zoneinfo, we first look in TZPATH
Expand All @@ -43,19 +38,23 @@ def get_tz_data(zone_name):
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
def _find_and_read_tzfile_tzpath(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
for search_path in zoneinfo.TZPATH:
if os.path.isfile(os.path.join(search_path, zone_name)):
return _read_tzfile_as_frame(search_path, zone_name)
return _read_tzfile_as_columns(search_path, zone_name)
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _find_and_read_tzfile_tzdata(zone_name):
def _find_and_read_tzfile_tzdata(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
import importlib.resources

package_base = "tzdata.zoneinfo"
try:
return _read_tzfile_as_frame(
return _read_tzfile_as_columns(
str(importlib.resources.files(package_base)), zone_name
)
# TODO: make it so that the call to libcudf raises a
Expand All @@ -77,99 +76,23 @@ def _find_and_read_tzfile_tzdata(zone_name):
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _read_tzfile_as_frame(tzdir, zone_name):
def _read_tzfile_as_columns(
tzdir, zone_name: str
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = (
as_column([min_date]),
as_column([np.timedelta64(0, "s")]),
)

return DataFrame._from_data(
dict(
zip(["transition_times", "offsets"], transition_times_and_offsets)
)
)

return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
return tuple(transition_times_and_offsets) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Is this `type: ignore` needed because the Cython wrapper has no typing information?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I believe so. mypy infers the return value of the Cython function as `list[Any]`.


def _find_ambiguous_and_nonexistent(
    data: DatetimeColumn, zone_name: str
) -> Tuple:
    """
    Flag the entries of `data` that are ambiguous or nonexistent as
    wall-clock times in the timezone `zone_name`.

    Parameters
    ----------
    data
        Column of timestamps to classify.
    zone_name
        Name of the timezone, passed to `get_tz_data`.

    Returns
    -------
    A 2-tuple ``(ambiguous, nonexistent)``. When the timezone table
    holds a single offset entry (i.e. no transitions), the plain tuple
    ``(False, False)`` is returned. Otherwise each element is the
    result of ``label_bins(...).notnull()`` over `data`, marking the
    values that fall inside an ambiguous (respectively nonexistent)
    interval.
    """
    tz_table = get_tz_data(zone_name)
    utc_transitions = tz_table["transition_times"]
    utc_offsets = tz_table["offsets"].astype(
        f"timedelta64[{data.time_unit}]"
    )

    # A single offset entry means the zone never transitions, so no
    # timestamp can be ambiguous or nonexistent.
    if len(utc_offsets) == 1:
        return False, False

    # Pair every transition (skipping the first table row) with the
    # offset that applies after it and the offset that applied before.
    moments = utc_transitions[1:]._column
    offsets_after = utc_offsets[1:]._column
    offsets_before = utc_offsets[:-1]._column

    # Picture two wall clocks read at the instant of each transition:
    # one that has been adjusted to the new offset, and one that was
    # left on the old offset.
    adjusted_clock = moments + offsets_after
    unadjusted_clock = moments + offsets_before

    def _within_windows(selector, starts, stops):
        # Keep only the transitions picked by `selector`, then test each
        # value of `data` against the half-open windows [start, stop).
        # NOTE(review): relies on label_bins yielding null for values
        # outside every window - confirm against its documentation.
        return label_bins(
            data,
            left_edges=starts.apply_boolean_mask(selector),
            left_inclusive=True,
            right_edges=stops.apply_boolean_mask(selector),
            right_inclusive=False,
        ).notnull()

    # Clock turned back: the adjusted clock reads earlier than the
    # unadjusted one, and every reading between the two is ambiguous.
    ambiguous = _within_windows(
        adjusted_clock < unadjusted_clock, adjusted_clock, unadjusted_clock
    )

    # Clock turned forward: the unadjusted clock reads earlier, and
    # every reading between the two never appears on the wall clock.
    nonexistent = _within_windows(
        adjusted_clock > unadjusted_clock, unadjusted_clock, adjusted_clock
    )

    return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
def check_ambiguous_and_nonexistent(
ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
) -> Tuple[Literal["NaT"], Literal["NaT"]]:
if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
Expand All @@ -178,80 +101,4 @@ def localize(
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NaT, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.base_data,
dtype=dtype,
mask=localized.base_mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def delocalize(data: DatetimeColumn) -> DatetimeColumn:
    """
    Return a timezone-naive view of `data`.

    A `DatetimeTZColumn` is replaced by its `_local_time` attribute;
    any other column is considered timezone-naive already and is
    returned unchanged.
    """
    if not isinstance(data, DatetimeTZColumn):
        # nothing to strip: the column carries no timezone
        return data
    return data._local_time


def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
    """
    Re-label a timezone-aware column with the timezone `zone_name`.

    Parameters
    ----------
    data
        Timezone-aware column to convert.
    zone_name
        Name of the target timezone.

    Returns
    -------
    A copy of `data` when it is already in `zone_name`; otherwise a
    column built from the buffers of ``data._utc_time`` with a
    ``pd.DatetimeTZDtype`` for the target zone.

    Raises
    ------
    TypeError
        If `data` is not a `DatetimeTZColumn`.
    """
    if not isinstance(data, DatetimeTZColumn):
        raise TypeError(
            "Cannot convert from timezone-naive timestamps to "
            "timezone-aware timestamps. For that, "
            "use `tz_localize`."
        )

    # Same zone requested: hand back a copy rather than rebuilding.
    if str(data.dtype.tz) == zone_name:
        return data.copy()

    as_utc = data._utc_time
    target_dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
    # Only the dtype changes; data, mask, size and offset are taken
    # from the UTC column as-is.
    return cast(
        DatetimeTZColumn,
        build_column(
            data=as_utc.base_data,
            dtype=target_dtype,
            mask=as_utc.base_mask,
            size=as_utc.size,
            offset=as_utc.offset,
        ),
    )


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
    """
    Shift `data` by the UTC offsets of the timezone `zone_name`.

    The transition times from `get_tz_data` are cast to the base dtype
    of `data`, each value of `data` is located among them with a
    right-sided sorted search, and the offset found at the preceding
    position is added to the value.
    """
    transitions, utc_offsets = get_tz_data(zone_name)._columns
    transitions = transitions.astype(_get_base_dtype(data.dtype))
    # "right" search minus one selects the entry just before the
    # insertion point of each value.
    positions = search_sorted([transitions], [data], "right") - 1
    # NOTE(review): nullify=True presumably yields nulls for positions
    # outside the table - confirm against `take`.
    return data + utc_offsets.take(positions, nullify=True)


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
    """
    Shift `data` back by the UTC offsets of the timezone `zone_name`.

    The transition times from `get_tz_data` are first moved by their
    own offsets and cast to the dtype of `data`. Each value of `data`
    is then located among them with a right-sided sorted search, and
    the offset found at the preceding position is subtracted from the
    value.
    """
    transitions, utc_offsets = get_tz_data(zone_name)._columns
    # Express the transition instants as shifted by their own offsets
    # so they are comparable with `data`.
    shifted_transitions = (transitions + utc_offsets).astype(data.dtype)
    # "right" search minus one selects the entry just before the
    # insertion point of each value.
    positions = search_sorted([shifted_transitions], [data], "right") - 1
    # NOTE(review): nullify=True presumably yields nulls for positions
    # outside the table - confirm against `take`.
    return data - utc_offsets.take(positions, nullify=True)
return ambiguous, nonexistent
Loading
Loading