Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tz_convert method to convert between timestamps #13328

Merged
merged 18 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ cdef class Column:
object null_count=None,
object children=()
):

self._size = size
self._distinct_count = {}
self._dtype = dtype
Expand Down
27 changes: 25 additions & 2 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,38 @@ def localize(
return cast(
DatetimeTZColumn,
build_column(
data=gmt_data.data,
data=gmt_data.base_data,
dtype=dtype,
mask=localized.mask,
mask=localized.base_mask,
shwina marked this conversation as resolved.
Show resolved Hide resolved
size=gmt_data.size,
offset=gmt_data.offset,
),
)


def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
    """Convert a timezone-aware column to the time zone ``zone_name``.

    Parameters
    ----------
    data : DatetimeTZColumn
        Timezone-aware column to convert.
    zone_name : str
        Name of the target time zone, e.g. ``"America/New_York"``.

    Returns
    -------
    DatetimeTZColumn
        A column representing the same instants in time, with its dtype's
        time zone set to ``zone_name``.

    Raises
    ------
    TypeError
        If ``data`` is timezone-naive (not a ``DatetimeTZColumn``).
    """
    if not isinstance(data, DatetimeTZColumn):
        raise TypeError(
            "Cannot convert from timezone-naive timestamps to "
            "timezone-aware timestamps. For that, "
            # Fixed: the backtick pair around the method name was
            # previously unterminated ("`tz_localize instead.").
            "use `tz_localize` instead."
        )
    # Converting to the zone we are already in is a no-op; return a copy
    # so callers never share the input column's buffers unexpectedly.
    if zone_name == str(data.dtype.tz):
        return data.copy()
    # The stored data is already UTC; "conversion" only changes the dtype's
    # time zone, reusing the same base buffers (zero-copy).
    utc_time = data._utc_time
    out = cast(
        DatetimeTZColumn,
        build_column(
            data=utc_time.base_data,
            dtype=pd.DatetimeTZDtype(data._time_unit, zone_name),
            mask=utc_time.base_mask,
            size=utc_time.size,
            offset=utc_time.offset,
        ),
    )
    return out


def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
tz_data_for_zone = get_tz_data(zone_name)
transition_times, offsets = tz_data_for_zone._columns
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,18 @@ def to_arrow(self):
self._local_time.to_arrow(), str(self.dtype.tz)
)

@property
def _utc_time(self):
    """The stored UTC instants, exposed as a timezone-naive column.

    The underlying buffers are shared with this column; only the dtype
    is swapped for its timezone-free base dtype.
    """
    naive_dtype = _get_base_dtype(self.dtype)
    return DatetimeColumn(
        data=self.base_data,
        dtype=naive_dtype,
        mask=self.base_mask,
        size=self.size,
        offset=self.offset,
        null_count=self.null_count,
    )

@property
def _local_time(self):
"""Return the local time as naive timestamps."""
Expand All @@ -589,6 +601,18 @@ def as_string_column(
) -> "cudf.core.column.StringColumn":
return self._local_time.as_string_column(dtype, format, **kwargs)

def __repr__(self):
    """Debug repr that shows *local* wall-clock timestamps.

    Arrow renders tz-aware arrays as UTC instants, whereas pandas shows
    local times; we follow pandas, so the tz is attached to the
    local-time values before stringifying.
    """
    # Arrow prints the UTC timestamps, but we want to print the
    # local timestamps:
    tz_type = pa.timestamp(self.dtype.unit, str(self.dtype.tz))
    arr = self._local_time.to_arrow().cast(tz_type)
    parts = [
        object.__repr__(self),
        arr.to_string(),
        f"dtype: {self.dtype}",
    ]
    return "\n".join(parts)


def infer_format(element: str, **kwargs) -> str:
"""
Expand Down
41 changes: 41 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2544,6 +2544,47 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
result_col = localize(self._column, tz, ambiguous, nonexistent)
return DatetimeIndex._from_data({self.name: result_col})

def tz_convert(self, tz):
    """
    Convert tz-aware datetimes from one time zone to another.

    Parameters
    ----------
    tz : str
        Time zone for time. Corresponding timestamps would be converted
        to this time zone of the Datetime Array/Index.
        A `tz` of None will convert to UTC and remove the timezone
        information.

    Returns
    -------
    DatetimeIndex containing timestamps corresponding to the timezone
    `tz`.

    Examples
    --------
    >>> import cudf
    >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D')
    >>> dti = dti.tz_localize("America/New_York")
    >>> dti
    DatetimeIndex(['2018-03-01 09:00:00-05:00',
                   '2018-03-02 09:00:00-05:00',
                   '2018-03-03 09:00:00-05:00'],
                  dtype='datetime64[ns, America/New_York]')
    >>> dti.tz_convert("Europe/London")
    DatetimeIndex(['2018-03-01 14:00:00+00:00',
                   '2018-03-02 14:00:00+00:00',
                   '2018-03-03 14:00:00+00:00'],
                  dtype='datetime64[ns, Europe/London]')
    """
    # Deferred import: importing timezones at module top creates a
    # circular import.
    from cudf.core._internals.timezones import convert, localize

    # tz=None drops the time zone entirely: expose the UTC instants and
    # delocalize them. Otherwise, retag the column with the new zone.
    result_col = (
        localize(self._column._utc_time, None)
        if tz is None
        else convert(self._column, tz)
    )
    return DatetimeIndex._from_data({self.name: result_col})


class TimedeltaIndex(GenericIndex):
"""
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4609,6 +4609,27 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
index=self.series._index,
)

@copy_docstring(DatetimeIndex.tz_convert)
def tz_convert(self, tz):
    """
    Parameters
    ----------
    tz : str
        Time zone for time. Corresponding timestamps would be converted
        to this time zone of the Datetime Array/Index.
        A `tz` of None will convert to UTC and remove the
        timezone information.
    """
    # Deferred import to avoid a circular dependency with the
    # timezones internals module.
    from cudf.core._internals.timezones import convert

    column = self.series._column
    # tz=None strips the zone (UTC instants, naive dtype); otherwise
    # retag the column with the requested zone.
    result_col = column._utc_time if tz is None else convert(column, tz)
    return Series._from_data(
        {self.series.name: result_col}, index=self.series._index
    )


class TimedeltaProperties:
"""
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/indexes/datetime/test_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

import pandas as pd

import cudf
from cudf.testing._utils import assert_eq


def test_slice_datetimetz_index():
    # Slicing a tz-aware index (including null entries) must match pandas.
    data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"]
    pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]")
    idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]")
    expected = pidx.tz_localize("US/Eastern")[1:4]
    got = idx.tz_localize("US/Eastern")[1:4]
    assert_eq(expected, got)
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,13 @@ def test_tz_localize():
pidx.tz_localize("America/New_York"),
idx.tz_localize("America/New_York"),
)


def test_tz_convert():
    # Converting a UTC-localized index to another zone must match pandas.
    naive_pidx = pd.date_range("2023-01-01", periods=3, freq="H")
    naive_idx = cudf.from_pandas(naive_pidx)
    pidx = naive_pidx.tz_localize("UTC")
    idx = naive_idx.tz_localize("UTC")
    expect = pidx.tz_convert("America/New_York")
    got = idx.tz_convert("America/New_York")
    assert_eq(expect, got)
57 changes: 57 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,60 @@ def test_delocalize(unit, tz):
expect = psr.dt.tz_localize(tz).dt.tz_localize(None)
got = sr.dt.tz_localize(tz).dt.tz_localize(None)
assert_eq(expect, got)


@pytest.mark.parametrize(
    "from_tz", ["Europe/London", "America/Chicago", "UTC"]
)
@pytest.mark.parametrize(
    "to_tz", ["Europe/London", "America/Chicago", "UTC", None]
)
def test_convert(from_tz, to_tz):
    # Localize identical pandas/cudf series to ``from_tz`` and verify
    # that converting both to ``to_tz`` (including tz=None) agrees.
    naive = pd.Series(pd.date_range("2023-01-01", periods=3, freq="H"))
    gs = cudf.from_pandas(naive)
    ps = naive.dt.tz_localize(from_tz)
    gs = gs.dt.tz_localize(from_tz)
    expect = ps.dt.tz_convert(to_tz)
    got = gs.dt.tz_convert(to_tz)
    assert_eq(expect, got)


def test_convert_from_naive():
    # ``tz_convert`` is only valid on tz-aware data; naive timestamps
    # must go through ``tz_localize`` first.
    gs = cudf.Series(cudf.date_range("2023-01-01", periods=3, freq="H"))
    with pytest.raises(TypeError):
        gs.dt.tz_convert("America/New_York")


@pytest.mark.parametrize(
    "data,original_timezone,target_timezone",
    [
        # DST transition:
        (["2023-03-12 01:30:00"], "America/New_York", "America/Los_Angeles"),
        # crossing the international date line:
        (["2023-05-17 23:30:00"], "Pacific/Auckland", "America/Los_Angeles"),
        # timezone with non-integer offset:
        (["2023-05-17 12:00:00"], "Asia/Kolkata", "Australia/Eucla"),
        # timezone with negative offset:
        (["2023-05-17 09:00:00"], "America/Los_Angeles", "Pacific/Auckland"),
        # conversion across multiple days:
        (["2023-05-16 23:30:00"], "America/New_York", "Asia/Kolkata"),
        # timezone with half-hour offset:
        (["2023-05-17 12:00:00"], "Asia/Kolkata", "Australia/Adelaide"),
        # timezone conversion with a timestamp in the future
        (["2025-01-01 00:00:00"], "America/New_York", "Europe/London"),
        # timezone conversion with a timestamp in the past
        (["2000-01-01 12:00:00"], "Europe/Paris", "America/Los_Angeles"),
        # timezone conversion with a timestamp at midnight
        (["2023-05-17 00:00:00"], "Asia/Tokyo", "Europe/Paris"),
    ],
)
def test_convert_edge_cases(data, original_timezone, target_timezone):
    # Known tricky conversions (DST boundaries, fractional offsets,
    # date-line crossings) must agree with pandas.
    expect = (
        pd.Series(data, dtype="datetime64[s]")
        .dt.tz_localize(original_timezone)
        .dt.tz_convert(target_timezone)
    )
    got = (
        cudf.Series(data, dtype="datetime64[s]")
        .dt.tz_localize(original_timezone)
        .dt.tz_convert(target_timezone)
    )
    assert_eq(expect, got)