Support freq in DatetimeIndex (#14593)
When a `DatetimeIndex` has a fixed frequency offset, pandas gives it a `.freq` attribute by default. Because we don't support that attribute, we raise in pandas-compatible mode.

Thus, working with datetimes in pandas-compatible mode is practically impossible, because so many datetime operations (e.g., resample, groupby) involve setting a datetime column as the index.

This PR adds rudimentary support for the `freq` attribute.
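
A rough sketch of what this enables (identifiers are from this diff; outputs are illustrative):

```python
import cudf

# date_range now records its frequency on the index it returns
idx = cudf.date_range("2001-01-01", periods=3, freq="D")

# ...and the frequency survives the round trip to pandas
idx.to_pandas().freq  # <Day>

# Passing freq= to the constructor validates it against the data:
# values not evenly spaced at that frequency raise.
cudf.DatetimeIndex(["2001-01-01", "2001-01-03"], freq="D")
# ValueError: No unique frequency found
```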

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14593
shwina authored Dec 12, 2023
1 parent 0fa80ec commit a9dc521
Showing 5 changed files with 223 additions and 28 deletions.
87 changes: 69 additions & 18 deletions python/cudf/cudf/core/index.py
@@ -17,6 +17,7 @@
     Tuple,
     Type,
     Union,
+    cast,
 )

 import cupy
@@ -1427,14 +1428,21 @@ def __repr__(self):
         dtype_index = tmp_meta.rfind(" dtype=")
         prior_to_dtype = tmp_meta[:dtype_index]
         lines = lines[:-1]
-        lines.append(prior_to_dtype + " dtype='%s'" % self.dtype)
+        keywords = [f"dtype='{self.dtype}'"]
         if self.name is not None:
-            lines[-1] = lines[-1] + ", name='%s'" % self.name
+            keywords.append(f"name={self.name!r}")
         if "length" in tmp_meta:
-            lines[-1] = lines[-1] + ", length=%d)" % len(self)
-        else:
-            lines[-1] = lines[-1] + ")"
-
+            keywords.append(f"length={len(self)}")
+        if (
+            "freq" in tmp_meta
+            and isinstance(self, DatetimeIndex)
+            and self._freq is not None
+        ):
+            keywords.append(
+                f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}"
+            )
+        keywords = ", ".join(keywords)
+        lines.append(f"{prior_to_dtype} {keywords})")
         return "\n".join(lines)

     @_cudf_nvtx_annotate
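
With the rewritten `__repr__`, a frequency now renders as a trailing keyword, pandas-style. Illustrative output (exact wrapping may differ):

```python
>>> cudf.date_range("2001-01-01", periods=3, freq="D")
DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq='D')
```
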
@@ -2125,8 +2133,6 @@ def __init__(
         # pandas dtindex creation first which. For now
         # just make sure we handle np.datetime64 arrays
         # and then just dispatch upstream
-        if freq is not None:
-            raise NotImplementedError("Freq is not yet supported")
         if tz is not None:
             raise NotImplementedError("tz is not yet supported")
         if normalize is not False:
@@ -2140,6 +2146,8 @@ def __init__(
         if yearfirst is not False:
             raise NotImplementedError("yearfirst == True is not yet supported")

+        self._freq = _validate_freq(freq)
+
         valid_dtypes = tuple(
             f"datetime64[{res}]" for res in ("s", "ms", "us", "ns")
         )
@@ -2157,6 +2165,30 @@ def __init__(

         super().__init__(data, **kwargs)

+        if self._freq is not None:
+            unique_vals = self.to_series().diff().unique()
+            if len(unique_vals) > 2 or (
+                len(unique_vals) == 2
+                and unique_vals[1] != self._freq._maybe_as_fast_pandas_offset()
+            ):
+                raise ValueError("No unique frequency found")
+
+    @_cudf_nvtx_annotate
+    def _copy_type_metadata(
+        self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None
+    ) -> GenericIndex:
+        super()._copy_type_metadata(other, override_dtypes=override_dtypes)
+        self._freq = _validate_freq(other._freq)
+        return self
+
+    @classmethod
+    def _from_data(
+        cls, data: MutableMapping, name: Any = no_default, freq: Any = None
+    ):
+        result = super()._from_data(data, name)
+        result._freq = _validate_freq(freq)
+        return result
+
     def __getitem__(self, index):
         value = super().__getitem__(index)
         if cudf.get_option("mode.pandas_compatible") and isinstance(
@@ -2165,6 +2197,11 @@ def __getitem__(self, index):
             return pd.Timestamp(value)
         return value

+    @_cudf_nvtx_annotate
+    def copy(self, name=None, deep=False, dtype=None, names=None):
+        idx_copy = super().copy(name=name, deep=deep, dtype=dtype, names=names)
+        return idx_copy._copy_type_metadata(self)
+
     def searchsorted(
         self,
         value,
@@ -2518,7 +2555,13 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex:
             )
         else:
             nanos = self._values.astype("datetime64[ns]")
-        return pd.DatetimeIndex(nanos.to_pandas(), name=self.name)
+
+        freq = (
+            self._freq._maybe_as_fast_pandas_offset()
+            if self._freq is not None
+            else None
+        )
+        return pd.DatetimeIndex(nanos.to_pandas(), name=self.name, freq=freq)

     @_cudf_nvtx_annotate
     def _get_dt_field(self, field):
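
`_maybe_as_fast_pandas_offset`, used here and in `__repr__`, is assumed to map a fixed-frequency `cudf.DateOffset` onto the equivalent pandas offset. A sketch of that assumption:

```python
import cudf

offset = cudf.DateOffset._from_freqstr("1D")
# Assumed mapping; the concrete pandas offset type is an implementation detail.
offset._maybe_as_fast_pandas_offset().freqstr  # 'D'
```
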
@@ -2663,10 +2706,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         >>> tz_naive = cudf.date_range('2018-03-01 09:00', periods=3, freq='D')
         >>> tz_aware = tz_naive.tz_localize("America/New_York")
         >>> tz_aware
-        DatetimeIndex(['2018-03-01 09:00:00-05:00',
-                       '2018-03-02 09:00:00-05:00',
+        DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00',
                        '2018-03-03 09:00:00-05:00'],
-                      dtype='datetime64[ns, America/New_York]')
+                      dtype='datetime64[ns, America/New_York]', freq='D')

         Ambiguous or nonexistent datetimes are converted to NaT.
@@ -2685,14 +2727,16 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         ``ambiguous`` and ``nonexistent`` arguments. Any
         ambiguous or nonexistent timestamps are converted
         to 'NaT'.
-        """
+        """  # noqa: E501
         from cudf.core._internals.timezones import delocalize, localize

         if tz is None:
             result_col = delocalize(self._column)
         else:
             result_col = localize(self._column, tz, ambiguous, nonexistent)
-        return DatetimeIndex._from_data({self.name: result_col})
+        return DatetimeIndex._from_data(
+            {self.name: result_col}, freq=self._freq
+        )

     def tz_convert(self, tz):
         """
@@ -2717,16 +2761,15 @@ def tz_convert(self, tz):
         >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D')
         >>> dti = dti.tz_localize("America/New_York")
         >>> dti
-        DatetimeIndex(['2018-03-01 09:00:00-05:00',
-                       '2018-03-02 09:00:00-05:00',
+        DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00',
                        '2018-03-03 09:00:00-05:00'],
-                      dtype='datetime64[ns, America/New_York]')
+                      dtype='datetime64[ns, America/New_York]', freq='D')
         >>> dti.tz_convert("Europe/London")
         DatetimeIndex(['2018-03-01 14:00:00+00:00',
                        '2018-03-02 14:00:00+00:00',
                        '2018-03-03 14:00:00+00:00'],
                       dtype='datetime64[ns, Europe/London]')
-        """
+        """  # noqa: E501
         from cudf.core._internals.timezones import convert

         if tz is None:
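
The `freq=self._freq` plumbing above means localizing no longer drops the frequency. A sketch of the assumed behavior:

```python
import cudf

dti = cudf.date_range("2018-03-01 09:00", periods=3, freq="D")
aware = dti.tz_localize("America/New_York")
aware.to_pandas().freq  # <Day>, carried through tz_localize (assumed)
```
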
@@ -3625,3 +3668,11 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]:
         old_s, s = s, old_s - quotient * s
         old_t, t = t, old_t - quotient * t
     return old_r, old_s, old_t
+
+
+def _validate_freq(freq: Any) -> cudf.DateOffset:
+    if isinstance(freq, str):
+        return cudf.DateOffset._from_freqstr(freq)
+    elif freq is not None and not isinstance(freq, cudf.DateOffset):
+        raise ValueError(f"Invalid frequency: {freq}")
+    return cast(cudf.DateOffset, freq)
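
`_validate_freq` normalizes the accepted spellings: a frequency string becomes a `cudf.DateOffset`, an existing offset or `None` passes through, and anything else is rejected. Sketched behavior (reprs assumed):

```python
_validate_freq("2D")                      # cudf.DateOffset(days=2)
_validate_freq(cudf.DateOffset(hours=1))  # returned unchanged
_validate_freq(None)                      # None
_validate_freq(3.5)                       # ValueError: Invalid frequency: 3.5
```
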
14 changes: 14 additions & 0 deletions python/cudf/cudf/core/resample.py
@@ -121,20 +121,33 @@ class _ResampleGrouping(_Grouping):

     bin_labels: cudf.core.index.Index

+    def __init__(self, obj, by=None, level=None):
+        self._freq = getattr(by, "freq", None)
+        super().__init__(obj, by, level)
+
     def copy(self, deep=True):
         out = super().copy(deep=deep)
         result = _ResampleGrouping.__new__(_ResampleGrouping)
         result.names = out.names
         result._named_columns = out._named_columns
         result._key_columns = out._key_columns
         result.bin_labels = self.bin_labels.copy(deep=deep)
+        result._freq = self._freq
         return result

+    @property
+    def keys(self):
+        index = super().keys
+        if self._freq is not None and isinstance(index, cudf.DatetimeIndex):
+            return cudf.DatetimeIndex._from_data(index._data, freq=self._freq)
+        return index
+
     def serialize(self):
         header, frames = super().serialize()
         labels_head, labels_frames = self.bin_labels.serialize()
         header["__bin_labels"] = labels_head
         header["__bin_labels_count"] = len(labels_frames)
+        header["_freq"] = self._freq
         frames.extend(labels_frames)
         return header, frames

@@ -152,6 +165,7 @@ def deserialize(cls, header, frames):
         out.bin_labels = cudf.core.index.Index.deserialize(
             header["__bin_labels"], frames[-header["__bin_labels_count"] :]
         )
+        out._freq = header["_freq"]
         return out

     def _handle_frequency_grouper(self, by):
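
The net effect of threading `_freq` through the grouping, assuming the `keys` property above does the final re-attachment, is that resample bin labels keep the grouping frequency:

```python
import cudf

s = cudf.Series(
    range(6),
    index=cudf.date_range("2001-01-01", periods=6, freq="12h"),
)
out = s.resample("D").sum()
out.index.to_pandas().freq  # <Day> (assumed)
```
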
14 changes: 8 additions & 6 deletions python/cudf/cudf/core/tools/datetimes.py
@@ -463,13 +463,19 @@ class DateOffset:
     }

     _CODES_TO_UNITS = {
+        "N": "nanoseconds",
         "ns": "nanoseconds",
+        "U": "microseconds",
         "us": "microseconds",
         "ms": "milliseconds",
+        "L": "milliseconds",
         "s": "seconds",
+        "S": "seconds",
         "m": "minutes",
         "min": "minutes",
+        "T": "minutes",
         "h": "hours",
+        "H": "hours",
         "D": "days",
         "W": "weeks",
         "M": "months",
@@ -487,7 +493,7 @@ class DateOffset:
         pd_offset.Nano: "nanoseconds",
     }

-    _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
+    _FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)")

     def __init__(self, n=1, normalize=False, **kwds):
         if normalize:
@@ -843,10 +849,6 @@ def date_range(
         arr = cp.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
         return cudf.DatetimeIndex._from_data({name: result})
-    elif cudf.get_option("mode.pandas_compatible"):
-        raise NotImplementedError(
-            "`DatetimeIndex` with `freq` cannot be constructed."
-        )

     # The code logic below assumes `freq` is defined. It is first normalized
     # into `DateOffset` for further computation with timestamps.
@@ -940,7 +942,7 @@
     arr = cp.arange(start=start, stop=stop, step=step, dtype="int64")
     res = cudf.core.column.as_column(arr).astype("datetime64[ns]")

-    return cudf.DatetimeIndex._from_data({name: res})
+    return cudf.DatetimeIndex._from_data({name: res}, freq=freq)


 def _has_fixed_frequency(freq: DateOffset) -> bool:
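
The widened `_FREQSTR_REGEX` and alias table accept the pandas-style code spellings, including signed multiples. A quick check against the new pattern:

```python
import re

_FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)")

_FREQSTR_REGEX.match("10T").groups()   # ('10', 'T')  -> 10 minutes
_FREQSTR_REGEX.match("-3ms").groups()  # ('-3', 'ms') -> minus 3 milliseconds
_FREQSTR_REGEX.match("H").groups()     # ('', 'H')    -> 1 hour
```
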
8 changes: 8 additions & 0 deletions python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -707,6 +707,14 @@ def Index__new__(cls, *args, **kwargs):
     "Resampler", cudf.core.resample._Resampler, pd_Resampler
 )

+DataFrameResampler = make_intermediate_proxy_type(
+    "DataFrameResampler", cudf.core.resample.DataFrameResampler, pd_Resampler
+)
+
+SeriesResampler = make_intermediate_proxy_type(
+    "SeriesResampler", cudf.core.resample.SeriesResampler, pd_Resampler
+)
+
 StataReader = make_intermediate_proxy_type(
     "StataReader",
     _Unusable,
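
These proxies let `cudf.pandas` hand back wrapped resampler objects. A sketch of the intended flow, assuming the usual `cudf.pandas` activation:

```python
import cudf.pandas

cudf.pandas.install()  # must run before pandas is imported

import pandas as pd

df = pd.DataFrame(
    {"x": range(4)},
    index=pd.date_range("2001-01-01", periods=4, freq="h"),
)
# resample() now yields the DataFrameResampler proxy registered above,
# dispatching to cudf where possible and falling back to pandas otherwise.
df.resample("2h").sum()
```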