Skip to content

Commit

Permalink
Move timezone conversion logic to DatetimeColumn (#15545)
Browse files Browse the repository at this point in the history
Moves methods/logic in `python/cudf/cudf/core/_internals/timezones.py` to the newly created `DatetimeColumn.tz_localize` and `DatetimeColumn.tz_convert`.

Additionally adds typing and improves an error message when doing `tz_convert(None)` on a tz-naive Series/Index to raise a `TypeError` (like pandas) instead of an `AttributeError`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15545
  • Loading branch information
mroeschke authored May 3, 2024
1 parent b8503bc commit ce6902f
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 214 deletions.
201 changes: 24 additions & 177 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,18 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast
from typing import Literal, Tuple

import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype
from cudf.core.column.column import as_column
from cudf.core.column.datetime import DatetimeColumn
from cudf.core.column.timedelta import TimeDeltaColumn


@lru_cache(maxsize=20)
def get_tz_data(zone_name):
def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
"""
Return timezone data (transition times and UTC offsets) for the
given IANA time zone.
Expand All @@ -31,8 +26,8 @@ def get_tz_data(zone_name):
Returns
-------
DataFrame with two columns containing the transition times
("transition_times") and corresponding UTC offsets ("offsets").
Tuple with two columns containing the transition times
and corresponding UTC offsets.
"""
try:
# like zoneinfo, we first look in TZPATH
Expand All @@ -43,19 +38,23 @@ def get_tz_data(zone_name):
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
def _find_and_read_tzfile_tzpath(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
for search_path in zoneinfo.TZPATH:
if os.path.isfile(os.path.join(search_path, zone_name)):
return _read_tzfile_as_frame(search_path, zone_name)
return _read_tzfile_as_columns(search_path, zone_name)
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _find_and_read_tzfile_tzdata(zone_name):
def _find_and_read_tzfile_tzdata(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
import importlib.resources

package_base = "tzdata.zoneinfo"
try:
return _read_tzfile_as_frame(
return _read_tzfile_as_columns(
str(importlib.resources.files(package_base)), zone_name
)
# TODO: make it so that the call to libcudf raises a
Expand All @@ -77,99 +76,23 @@ def _find_and_read_tzfile_tzdata(zone_name):
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _read_tzfile_as_frame(tzdir, zone_name):
def _read_tzfile_as_columns(
tzdir, zone_name: str
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = (
as_column([min_date]),
as_column([np.timedelta64(0, "s")]),
)

return DataFrame._from_data(
dict(
zip(["transition_times", "offsets"], transition_times_and_offsets)
)
)

return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
return tuple(transition_times_and_offsets) # type: ignore[return-value]

def _find_ambiguous_and_nonexistent(
data: DatetimeColumn, zone_name: str
) -> Tuple:
"""
Recognize ambiguous and nonexistent timestamps for the given timezone.
Returns a tuple of columns, both of "bool" dtype and of the same
size as `data`, that respectively indicate ambiguous and
nonexistent timestamps in `data` with the value `True`.
Ambiguous and/or nonexistent timestamps are only possible if any
transitions occur in the time zone database for the given timezone.
If no transitions occur, the tuple `(False, False)` is returned.
"""
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data.time_unit}]"
)

if len(offsets) == 1: # no transitions
return False, False

transition_times, offsets, old_offsets = (
transition_times[1:]._column,
offsets[1:]._column,
offsets[:-1]._column,
)

# Assume we have two clocks at the moment of transition:
# - Clock 1 is turned forward or backwards correctly
# - Clock 2 makes no changes
clock_1 = transition_times + offsets
clock_2 = transition_times + old_offsets

# At the start of an ambiguous time period, Clock 1 (which has
# been turned back) reads less than Clock 2:
cond = clock_1 < clock_2
ambiguous_begin = clock_1.apply_boolean_mask(cond)

# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
data,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
cond = clock_1 > clock_2
nonexistent_begin = clock_2.apply_boolean_mask(cond)

# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
data,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()

return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
def check_ambiguous_and_nonexistent(
ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
) -> Tuple[Literal["NaT"], Literal["NaT"]]:
if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
Expand All @@ -178,80 +101,4 @@ def localize(
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NaT, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.base_data,
dtype=dtype,
mask=localized.base_mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def delocalize(data: DatetimeColumn) -> DatetimeColumn:
"""
Convert a timezone-aware datetime column to a timezone-naive one.
If the column is already timezone-naive, return it as is.
"""
if isinstance(data, DatetimeTZColumn):
return data._local_time
# already timezone-naive:
return data


def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
if not isinstance(data, DatetimeTZColumn):
raise TypeError(
"Cannot convert from timezone-naive timestamps to "
"timezone-aware timestamps. For that, "
"use `tz_localize`."
)
if zone_name == str(data.dtype.tz):
return data.copy()
utc_time = data._utc_time
out = cast(
DatetimeTZColumn,
build_column(
data=utc_time.base_data,
dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
mask=utc_time.base_mask,
size=utc_time.size,
offset=utc_time.offset,
),
)
return out


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times = transition_times.astype(_get_base_dtype(data.dtype))
indices = search_sorted([transition_times], [data], "right") - 1
offsets_from_utc = offsets.take(indices, nullify=True)
return data + offsets_from_utc


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times_local = (transition_times + offsets).astype(data.dtype)
indices = search_sorted([transition_times_local], [data], "right") - 1
offsets_to_utc = offsets.take(indices, nullify=True)
return data - offsets_to_utc
return ambiguous, nonexistent
Loading

0 comments on commit ce6902f

Please sign in to comment.