Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move timezone conversion logic to DatetimeColumn #15545

Merged
merged 7 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 19 additions & 173 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,18 @@
import os
import zoneinfo
from functools import lru_cache
from typing import Tuple, cast
from typing import Literal, Tuple

import numpy as np
import pandas as pd

import cudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf._lib.timezone import make_timezone_transition_table
from cudf.core.column.column import as_column, build_column
from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
from cudf.core.dataframe import DataFrame
from cudf.utils.dtypes import _get_base_dtype
from cudf.core.column.column import as_column
from cudf.core.column.datetime import DatetimeColumn
from cudf.core.column.timedelta import TimeDeltaColumn


@lru_cache(maxsize=20)
def get_tz_data(zone_name):
def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: Please update the docstring description of the return value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the catch. Updated

"""
Return timezone data (transition times and UTC offsets) for the
given IANA time zone.
Expand All @@ -43,14 +38,18 @@ def get_tz_data(zone_name):
return tz_table


def _find_and_read_tzfile_tzpath(zone_name):
def _find_and_read_tzfile_tzpath(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
for search_path in zoneinfo.TZPATH:
if os.path.isfile(os.path.join(search_path, zone_name)):
return _read_tzfile_as_frame(search_path, zone_name)
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _find_and_read_tzfile_tzdata(zone_name):
def _find_and_read_tzfile_tzdata(
zone_name: str,
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
import importlib.resources

package_base = "tzdata.zoneinfo"
Expand All @@ -77,99 +76,23 @@ def _find_and_read_tzfile_tzdata(zone_name):
raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _read_tzfile_as_frame(tzdir, zone_name):
def _read_tzfile_as_frame(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

polish: rename to _read_tzfile_as_columns, since we no longer return a frame?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea. Change it to that

tzdir, zone_name: str
) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
transition_times_and_offsets = make_timezone_transition_table(
tzdir, zone_name
)

if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
transition_times_and_offsets = (
as_column([min_date]),
as_column([np.timedelta64(0, "s")]),
)

return DataFrame._from_data(
dict(
zip(["transition_times", "offsets"], transition_times_and_offsets)
)
)


def _find_ambiguous_and_nonexistent(
data: DatetimeColumn, zone_name: str
) -> Tuple:
"""
Recognize ambiguous and nonexistent timestamps for the given timezone.

Returns a tuple of columns, both of "bool" dtype and of the same
size as `data`, that respectively indicate ambiguous and
nonexistent timestamps in `data` with the value `True`.

Ambiguous and/or nonexistent timestamps are only possible if any
transitions occur in the time zone database for the given timezone.
If no transitions occur, the tuple `(False, False)` is returned.
"""
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data.time_unit}]"
)

if len(offsets) == 1: # no transitions
return False, False

transition_times, offsets, old_offsets = (
transition_times[1:]._column,
offsets[1:]._column,
offsets[:-1]._column,
)

# Assume we have two clocks at the moment of transition:
# - Clock 1 is turned forward or backwards correctly
# - Clock 2 makes no changes
clock_1 = transition_times + offsets
clock_2 = transition_times + old_offsets

# At the start of an ambiguous time period, Clock 1 (which has
# been turned back) reads less than Clock 2:
cond = clock_1 < clock_2
ambiguous_begin = clock_1.apply_boolean_mask(cond)

# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
data,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
cond = clock_1 > clock_2
nonexistent_begin = clock_2.apply_boolean_mask(cond)
return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
return tuple(transition_times_and_offsets) # type: ignore[return-value]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: This type ignore is because the cython wrapper has no typing information?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I believe so. mypy is picking up the call from the cython function as list[Any]


# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
data,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()

return ambiguous, nonexistent


def localize(
data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
) -> DatetimeTZColumn:
def check_ambiguous_and_nonexistent(
ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: parse don't validate, check_ambiguous_and_nonexistent(...) -> tuple[Literal["NaT"], Literal["NaT"]] and return those values (and then use them in callers).

Though here it's a bit of a wash since you're not doing any type narrowing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing can change it to parse (although these arguments are not actually used)

if ambiguous != "NaT":
raise NotImplementedError(
"Only ambiguous='NaT' is currently supported"
Expand All @@ -178,80 +101,3 @@ def localize(
raise NotImplementedError(
"Only nonexistent='NaT' is currently supported"
)
if isinstance(data, DatetimeTZColumn):
raise ValueError(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NaT, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.base_data,
dtype=dtype,
mask=localized.base_mask,
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def delocalize(data: DatetimeColumn) -> DatetimeColumn:
"""
Convert a timezone-aware datetime column to a timezone-naive one.
If the column is already timezone-naive, return it as is.
"""
if isinstance(data, DatetimeTZColumn):
return data._local_time
# already timezone-naive:
return data


def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
if not isinstance(data, DatetimeTZColumn):
raise TypeError(
"Cannot convert from timezone-naive timestamps to "
"timezone-aware timestamps. For that, "
"use `tz_localize`."
)
if zone_name == str(data.dtype.tz):
return data.copy()
utc_time = data._utc_time
out = cast(
DatetimeTZColumn,
build_column(
data=utc_time.base_data,
dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
mask=utc_time.base_mask,
size=utc_time.size,
offset=utc_time.offset,
),
)
return out


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times = transition_times.astype(_get_base_dtype(data.dtype))
indices = search_sorted([transition_times], [data], "right") - 1
offsets_from_utc = offsets.take(indices, nullify=True)
return data + offsets_from_utc


def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
transition_times_local = (transition_times + offsets).astype(data.dtype)
indices = search_sorted([transition_times_local], [data], "right") - 1
offsets_to_utc = offsets.take(indices, nullify=True)
return data - offsets_to_utc
Loading
Loading