Support freq in DatetimeIndex (#14593)
When a `DatetimeIndex` has a fixed frequency offset, pandas gives it a `.freq` attribute by default. Because we don't support that attribute, we raise in pandas-compatible mode.

Thus, working with datetimes in pandas-compatible mode is practically impossible, because so many datetime operations (e.g., resample, groupby) involve setting a datetime column as the index.

This PR adds rudimentary support for the `freq` attribute.
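
A rough sketch of what this enables (identifiers are from this diff; outputs are illustrative):

```python
import cudf

# date_range now records its frequency on the index it returns
idx = cudf.date_range("2001-01-01", periods=3, freq="D")

# ...and the frequency survives the round trip to pandas
idx.to_pandas().freq  # <Day>

# Passing freq= to the constructor validates it against the data:
# values not evenly spaced at that frequency raise.
cudf.DatetimeIndex(["2001-01-01", "2001-01-03"], freq="D")
# ValueError: No unique frequency found
```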

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14593
shwina authored Dec 12, 2023
1 parent 0fa80ec commit a9dc521
Showing 5 changed files with 223 additions and 28 deletions.
87 changes: 69 additions & 18 deletions python/cudf/cudf/core/index.py
@@ -17,6 +17,7 @@
     Tuple,
     Type,
     Union,
+    cast,
 )

 import cupy
@@ -1427,14 +1428,21 @@ def __repr__(self):
         dtype_index = tmp_meta.rfind(" dtype=")
         prior_to_dtype = tmp_meta[:dtype_index]
         lines = lines[:-1]
-        lines.append(prior_to_dtype + " dtype='%s'" % self.dtype)
+        keywords = [f"dtype='{self.dtype}'"]
         if self.name is not None:
-            lines[-1] = lines[-1] + ", name='%s'" % self.name
+            keywords.append(f"name={self.name!r}")
         if "length" in tmp_meta:
-            lines[-1] = lines[-1] + ", length=%d)" % len(self)
-        else:
-            lines[-1] = lines[-1] + ")"
-
+            keywords.append(f"length={len(self)}")
+        if (
+            "freq" in tmp_meta
+            and isinstance(self, DatetimeIndex)
+            and self._freq is not None
+        ):
+            keywords.append(
+                f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}"
+            )
+        keywords = ", ".join(keywords)
+        lines.append(f"{prior_to_dtype} {keywords})")
         return "\n".join(lines)

     @_cudf_nvtx_annotate
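
With the rewritten `__repr__`, a frequency now renders as a trailing keyword, pandas-style. Illustrative output (exact wrapping may differ):

```python
>>> cudf.date_range("2001-01-01", periods=3, freq="D")
DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq='D')
```
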
@@ -2125,8 +2133,6 @@ def __init__(
         # pandas dtindex creation first which. For now
         # just make sure we handle np.datetime64 arrays
         # and then just dispatch upstream
-        if freq is not None:
-            raise NotImplementedError("Freq is not yet supported")
         if tz is not None:
             raise NotImplementedError("tz is not yet supported")
         if normalize is not False:
@@ -2140,6 +2146,8 @@ def __init__(
         if yearfirst is not False:
             raise NotImplementedError("yearfirst == True is not yet supported")

+        self._freq = _validate_freq(freq)
+
         valid_dtypes = tuple(
             f"datetime64[{res}]" for res in ("s", "ms", "us", "ns")
         )
@@ -2157,6 +2165,30 @@ def __init__(

         super().__init__(data, **kwargs)

+        if self._freq is not None:
+            unique_vals = self.to_series().diff().unique()
+            if len(unique_vals) > 2 or (
+                len(unique_vals) == 2
+                and unique_vals[1] != self._freq._maybe_as_fast_pandas_offset()
+            ):
+                raise ValueError("No unique frequency found")
+
+    @_cudf_nvtx_annotate
+    def _copy_type_metadata(
+        self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None
+    ) -> GenericIndex:
+        super()._copy_type_metadata(other, override_dtypes=override_dtypes)
+        self._freq = _validate_freq(other._freq)
+        return self
+
+    @classmethod
+    def _from_data(
+        cls, data: MutableMapping, name: Any = no_default, freq: Any = None
+    ):
+        result = super()._from_data(data, name)
+        result._freq = _validate_freq(freq)
+        return result
+
     def __getitem__(self, index):
         value = super().__getitem__(index)
         if cudf.get_option("mode.pandas_compatible") and isinstance(
@@ -2165,6 +2197,11 @@ def __getitem__(self, index):
             return pd.Timestamp(value)
         return value

+    @_cudf_nvtx_annotate
+    def copy(self, name=None, deep=False, dtype=None, names=None):
+        idx_copy = super().copy(name=name, deep=deep, dtype=dtype, names=names)
+        return idx_copy._copy_type_metadata(self)
+
     def searchsorted(
         self,
         value,
@@ -2518,7 +2555,13 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex:
             )
         else:
             nanos = self._values.astype("datetime64[ns]")
-        return pd.DatetimeIndex(nanos.to_pandas(), name=self.name)
+
+        freq = (
+            self._freq._maybe_as_fast_pandas_offset()
+            if self._freq is not None
+            else None
+        )
+        return pd.DatetimeIndex(nanos.to_pandas(), name=self.name, freq=freq)

     @_cudf_nvtx_annotate
     def _get_dt_field(self, field):
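
`_maybe_as_fast_pandas_offset`, used here and in `__repr__`, is assumed to map a fixed-frequency `cudf.DateOffset` onto the equivalent pandas offset. A sketch of that assumption:

```python
import cudf

offset = cudf.DateOffset._from_freqstr("1D")
# Assumed mapping; the concrete pandas offset type is an implementation detail.
offset._maybe_as_fast_pandas_offset().freqstr  # 'D'
```
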
@@ -2663,10 +2706,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         >>> tz_naive = cudf.date_range('2018-03-01 09:00', periods=3, freq='D')
         >>> tz_aware = tz_naive.tz_localize("America/New_York")
         >>> tz_aware
-        DatetimeIndex(['2018-03-01 09:00:00-05:00',
-                       '2018-03-02 09:00:00-05:00',
+        DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00',
                        '2018-03-03 09:00:00-05:00'],
-                      dtype='datetime64[ns, America/New_York]')
+                      dtype='datetime64[ns, America/New_York]', freq='D')

         Ambiguous or nonexistent datetimes are converted to NaT.
@@ -2685,14 +2727,16 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         ``ambiguous`` and ``nonexistent`` arguments. Any
         ambiguous or nonexistent timestamps are converted
         to 'NaT'.
-        """
+        """  # noqa: E501
         from cudf.core._internals.timezones import delocalize, localize

         if tz is None:
             result_col = delocalize(self._column)
         else:
             result_col = localize(self._column, tz, ambiguous, nonexistent)
-        return DatetimeIndex._from_data({self.name: result_col})
+        return DatetimeIndex._from_data(
+            {self.name: result_col}, freq=self._freq
+        )

     def tz_convert(self, tz):
         """
@@ -2717,16 +2761,15 @@ def tz_convert(self, tz):
         >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D')
         >>> dti = dti.tz_localize("America/New_York")
         >>> dti
-        DatetimeIndex(['2018-03-01 09:00:00-05:00',
-                       '2018-03-02 09:00:00-05:00',
+        DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00',
                        '2018-03-03 09:00:00-05:00'],
-                      dtype='datetime64[ns, America/New_York]')
+                      dtype='datetime64[ns, America/New_York]', freq='D')
         >>> dti.tz_convert("Europe/London")
         DatetimeIndex(['2018-03-01 14:00:00+00:00',
                        '2018-03-02 14:00:00+00:00',
                        '2018-03-03 14:00:00+00:00'],
                       dtype='datetime64[ns, Europe/London]')
-        """
+        """  # noqa: E501
         from cudf.core._internals.timezones import convert

         if tz is None:
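
The `freq=self._freq` plumbing above means localizing no longer drops the frequency. A sketch of the assumed behavior:

```python
import cudf

dti = cudf.date_range("2018-03-01 09:00", periods=3, freq="D")
aware = dti.tz_localize("America/New_York")
aware.to_pandas().freq  # <Day>, carried through tz_localize (assumed)
```
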
@@ -3625,3 +3668,11 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]:
         old_s, s = s, old_s - quotient * s
         old_t, t = t, old_t - quotient * t
     return old_r, old_s, old_t
+
+
+def _validate_freq(freq: Any) -> cudf.DateOffset:
+    if isinstance(freq, str):
+        return cudf.DateOffset._from_freqstr(freq)
+    elif freq is not None and not isinstance(freq, cudf.DateOffset):
+        raise ValueError(f"Invalid frequency: {freq}")
+    return cast(cudf.DateOffset, freq)
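
`_validate_freq` normalizes the accepted spellings: a frequency string becomes a `cudf.DateOffset`, an existing offset or `None` passes through, and anything else is rejected. Sketched behavior (reprs assumed):

```python
_validate_freq("2D")                      # cudf.DateOffset(days=2)
_validate_freq(cudf.DateOffset(hours=1))  # returned unchanged
_validate_freq(None)                      # None
_validate_freq(3.5)                       # ValueError: Invalid frequency: 3.5
```
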
14 changes: 14 additions & 0 deletions python/cudf/cudf/core/resample.py
@@ -121,20 +121,33 @@ class _ResampleGrouping(_Grouping):

     bin_labels: cudf.core.index.Index

+    def __init__(self, obj, by=None, level=None):
+        self._freq = getattr(by, "freq", None)
+        super().__init__(obj, by, level)
+
     def copy(self, deep=True):
         out = super().copy(deep=deep)
         result = _ResampleGrouping.__new__(_ResampleGrouping)
         result.names = out.names
         result._named_columns = out._named_columns
         result._key_columns = out._key_columns
         result.bin_labels = self.bin_labels.copy(deep=deep)
+        result._freq = self._freq
         return result

+    @property
+    def keys(self):
+        index = super().keys
+        if self._freq is not None and isinstance(index, cudf.DatetimeIndex):
+            return cudf.DatetimeIndex._from_data(index._data, freq=self._freq)
+        return index
+
     def serialize(self):
         header, frames = super().serialize()
         labels_head, labels_frames = self.bin_labels.serialize()
         header["__bin_labels"] = labels_head
         header["__bin_labels_count"] = len(labels_frames)
+        header["_freq"] = self._freq
         frames.extend(labels_frames)
         return header, frames

@@ -152,6 +165,7 @@ def deserialize(cls, header, frames):
         out.bin_labels = cudf.core.index.Index.deserialize(
             header["__bin_labels"], frames[-header["__bin_labels_count"] :]
         )
+        out._freq = header["_freq"]
         return out

     def _handle_frequency_grouper(self, by):
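
The net effect of threading `_freq` through the grouping, assuming the `keys` property above does the final re-attachment, is that resample bin labels keep the grouping frequency:

```python
import cudf

s = cudf.Series(
    range(6),
    index=cudf.date_range("2001-01-01", periods=6, freq="12h"),
)
out = s.resample("D").sum()
out.index.to_pandas().freq  # <Day> (assumed)
```
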
14 changes: 8 additions & 6 deletions python/cudf/cudf/core/tools/datetimes.py
@@ -463,13 +463,19 @@ class DateOffset:
     }

     _CODES_TO_UNITS = {
+        "N": "nanoseconds",
         "ns": "nanoseconds",
+        "U": "microseconds",
         "us": "microseconds",
         "ms": "milliseconds",
+        "L": "milliseconds",
         "s": "seconds",
+        "S": "seconds",
         "m": "minutes",
         "min": "minutes",
+        "T": "minutes",
         "h": "hours",
+        "H": "hours",
         "D": "days",
         "W": "weeks",
         "M": "months",
@@ -487,7 +493,7 @@ class DateOffset:
         pd_offset.Nano: "nanoseconds",
     }

-    _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
+    _FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)")

     def __init__(self, n=1, normalize=False, **kwds):
         if normalize:
@@ -843,10 +849,6 @@ def date_range(
         arr = cp.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
         return cudf.DatetimeIndex._from_data({name: result})
-    elif cudf.get_option("mode.pandas_compatible"):
-        raise NotImplementedError(
-            "`DatetimeIndex` with `freq` cannot be constructed."
-        )

     # The code logic below assumes `freq` is defined. It is first normalized
     # into `DateOffset` for further computation with timestamps.
@@ -940,7 +942,7 @@
     arr = cp.arange(start=start, stop=stop, step=step, dtype="int64")
     res = cudf.core.column.as_column(arr).astype("datetime64[ns]")

-    return cudf.DatetimeIndex._from_data({name: res})
+    return cudf.DatetimeIndex._from_data({name: res}, freq=freq)


 def _has_fixed_frequency(freq: DateOffset) -> bool:
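
The widened `_FREQSTR_REGEX` and alias table accept the pandas-style code spellings, including signed multiples. A quick check against the new pattern:

```python
import re

_FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)")

_FREQSTR_REGEX.match("10T").groups()   # ('10', 'T')  -> 10 minutes
_FREQSTR_REGEX.match("-3ms").groups()  # ('-3', 'ms') -> minus 3 milliseconds
_FREQSTR_REGEX.match("H").groups()     # ('', 'H')    -> 1 hour
```
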
8 changes: 8 additions & 0 deletions python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -707,6 +707,14 @@ def Index__new__(cls, *args, **kwargs):
     "Resampler", cudf.core.resample._Resampler, pd_Resampler
 )

+DataFrameResampler = make_intermediate_proxy_type(
+    "DataFrameResampler", cudf.core.resample.DataFrameResampler, pd_Resampler
+)
+
+SeriesResampler = make_intermediate_proxy_type(
+    "SeriesResampler", cudf.core.resample.SeriesResampler, pd_Resampler
+)
+
 StataReader = make_intermediate_proxy_type(
     "StataReader",
     _Unusable,
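
These proxies let `cudf.pandas` hand back wrapped resampler objects. A sketch of the intended flow, assuming the usual `cudf.pandas` activation:

```python
import cudf.pandas

cudf.pandas.install()  # must run before pandas is imported

import pandas as pd

df = pd.DataFrame(
    {"x": range(4)},
    index=pd.date_range("2001-01-01", periods=4, freq="h"),
)
# resample() now yields the DataFrameResampler proxy registered above,
# dispatching to cudf where possible and falling back to pandas otherwise.
df.resample("2h").sum()
```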