Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup some timedelta/datetime column logic #14715

Merged
merged 35 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
9a5a4fe
default timedelta format the same for all types
mroeschke Jan 5, 2024
2f62408
Simplify some timedelta logic
mroeschke Jan 6, 2024
1be1f78
Remove private _time_unit attribute
mroeschke Jan 6, 2024
0ed1120
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 8, 2024
433ff6e
Replace self
mroeschke Jan 8, 2024
9631341
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 8, 2024
8365eee
fix for DatetimeTZDtype
mroeschke Jan 8, 2024
f0373d8
Fix contains for tz aware types
mroeschke Jan 9, 2024
d746a40
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 9, 2024
f8397bb
Add copyright
mroeschke Jan 9, 2024
b1ce2d1
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 10, 2024
eec7fd5
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 10, 2024
ae4251a
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 10, 2024
847cb93
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 19, 2024
41613ba
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 19, 2024
2e95199
Push element indexing logic to subclass
mroeschke Jan 19, 2024
52ffb66
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 22, 2024
40daf51
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 22, 2024
66236c0
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Jan 31, 2024
512a023
Remove _time_unit
mroeschke Jan 31, 2024
eff9d8e
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 2, 2024
2f88a3c
pandas 2.0 in
mroeschke Feb 2, 2024
f6a0ea0
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 3, 2024
3ebd26c
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 5, 2024
a679ab0
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 21, 2024
3f61b10
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 26, 2024
38bbb59
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Feb 29, 2024
f45514b
Merge branch 'branch-24.04' into ref/column/cleanups
mroeschke Mar 4, 2024
08c589d
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Mar 5, 2024
a4a7778
Merge branch 'ref/column/cleanups' of https://github.com/mroeschke/cu…
mroeschke Mar 5, 2024
df7f465
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Mar 11, 2024
8eef64f
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Mar 13, 2024
a507b05
Merge remote-tracking branch 'upstream/branch-24.04' into ref/column/…
mroeschke Mar 15, 2024
50d8ccf
Merge remote-tracking branch 'upstream/branch-24.06' into ref/column/…
mroeschke Mar 18, 2024
c143d71
Merge branch 'branch-24.06' into ref/column/cleanups
mroeschke Apr 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

import os
import zoneinfo
Expand Down Expand Up @@ -111,7 +111,7 @@ def _find_ambiguous_and_nonexistent(
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data._time_unit}]"
f"timedelta64[{data.time_unit}]"
wence- marked this conversation as resolved.
Show resolved Hide resolved
)

if len(offsets) == 1: # no transitions
Expand Down Expand Up @@ -180,7 +180,7 @@ def localize(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
Expand Down Expand Up @@ -227,7 +227,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
DatetimeTZColumn,
build_column(
data=utc_time.base_data,
dtype=pd.DatetimeTZDtype(data._time_unit, zone_name),
dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
mask=utc_time.base_mask,
size=utc_time.size,
offset=utc_time.offset,
Expand Down
16 changes: 8 additions & 8 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import datetime
import functools
import locale
import re
from locale import nl_langinfo
Expand Down Expand Up @@ -239,6 +240,8 @@ def __init__(
null_count: Optional[int] = None,
):
dtype = cudf.dtype(dtype)
if dtype.kind != "M":
raise TypeError(f"{self.dtype} is not a supported datetime type")

if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
Expand All @@ -254,24 +257,21 @@ def __init__(
null_count=null_count,
)

if self.dtype.type is not np.datetime64:
raise TypeError(f"{self.dtype} is not a supported datetime type")

self._time_unit, _ = np.datetime_data(self.dtype)

def __contains__(self, item: ScalarLike) -> bool:
try:
item_as_dt64 = np.datetime64(item, self._time_unit)
item_as_dt64 = np.datetime64(item, self.time_unit)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit If self.dtype is a DatetimeTZDtype is this code correct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good find, no it doesn't. In pandas least for DatetimeTZDtype, item must be tz-aware and the same UTC time to return True.

In [1]: import pandas as pd

In [2]: dti = pd.date_range("2020", periods=3, tz="UTC")

In [3]: pd.Timestamp("2020") in dti
Out[3]: False

In [4]: pd.Timestamp("2020", tz="UTC") in dti
Out[4]: True

In [5]: pd.Timestamp("2020", tz="US/Pacific") in dti
Out[5]: False

In [6]: pd.Timestamp("2019-12-31T16:00:00", tz="US/Pacific") in dti
Out[6]: True

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed this case and added a unit test

except ValueError:
# If item cannot be converted to datetime type
# np.datetime64 raises ValueError, hence `item`
# cannot exist in `self`.
return False
return item_as_dt64.astype("int64") in self.as_numerical

@property
@functools.cached_property
def time_unit(self) -> str:
return self._time_unit
if isinstance(self.dtype, pd.DatetimeTZDtype):
return self.dtype.unit
return np.datetime_data(self.dtype)[0]

@property
def year(self) -> ColumnBase:
Expand Down
41 changes: 13 additions & 28 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from __future__ import annotations

import datetime
import functools
from typing import Any, Optional, Sequence, cast

import numpy as np
Expand All @@ -18,13 +19,6 @@
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.utils import _all_bools_with_nulls

_dtype_to_format_conversion = {
"timedelta64[ns]": "%D days %H:%M:%S",
"timedelta64[us]": "%D days %H:%M:%S",
"timedelta64[ms]": "%D days %H:%M:%S",
"timedelta64[s]": "%D days %H:%M:%S",
}

_unit_to_nanoseconds_conversion = {
"ns": 1,
"us": 1_000,
Expand Down Expand Up @@ -86,6 +80,8 @@ def __init__(
null_count: Optional[int] = None,
):
dtype = cudf.dtype(dtype)
if dtype.kind != "m":
raise TypeError(f"{self.dtype} is not a supported duration type")

if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
Expand All @@ -101,14 +97,9 @@ def __init__(
null_count=null_count,
)

if self.dtype.type is not np.timedelta64:
raise TypeError(f"{self.dtype} is not a supported duration type")

self._time_unit, _ = np.datetime_data(self.dtype)

def __contains__(self, item: DatetimeLikeScalar) -> bool:
try:
item = np.timedelta64(item, self._time_unit)
item = np.timedelta64(item, self.time_unit)
except ValueError:
# If item cannot be converted to duration type
# np.timedelta64 raises ValueError, hence `item`
Expand Down Expand Up @@ -235,16 +226,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
"Cannot perform binary operation on timezone-naive columns"
" and timezone-aware timestamps."
)
if isinstance(other, pd.Timestamp):
if other.tz is not None:
if isinstance(other, datetime.datetime):
if other.tzinfo is not None:
raise NotImplementedError(tz_error_msg)
other = other.to_datetime64()
elif isinstance(other, pd.Timedelta):
other = other.to_timedelta64()
other = pd.Timestamp(other).to_datetime64()
elif isinstance(other, datetime.timedelta):
other = np.timedelta64(other)
elif isinstance(other, datetime.datetime) and other.tzinfo is not None:
raise NotImplementedError(tz_error_msg)
other = pd.Timedelta(other).to_timedelta64()

if isinstance(other, np.timedelta64):
other_time_unit = cudf.utils.dtypes.get_time_unit(other)
Expand All @@ -256,7 +243,7 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
else:
common_dtype = determine_out_dtype(self.dtype, other.dtype)
return cudf.Scalar(other.astype(common_dtype))
elif np.isscalar(other):
elif is_scalar(other):
return cudf.Scalar(other)
return NotImplemented

Expand All @@ -273,9 +260,9 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn":
),
)

@property
@functools.cached_property
def time_unit(self) -> str:
return self._time_unit
return np.datetime_data(self.dtype)[0]
wence- marked this conversation as resolved.
Show resolved Hide resolved

def fillna(
self,
Expand Down Expand Up @@ -318,9 +305,7 @@ def as_string_column(
self, dtype: Dtype, format=None, **kwargs
) -> "cudf.core.column.StringColumn":
if format is None:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%D days %H:%M:%S"
)
format = "%D days %H:%M:%S"
wence- marked this conversation as resolved.
Show resolved Hide resolved
if len(self) > 0:
return string._timedelta_to_str_typecast_functions[
cudf.dtype(self.dtype)
Expand Down