From 56430b46a6494cbec90b4c085085a905631be55f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 15 Dec 2021 16:09:38 -0800 Subject: [PATCH] Use pandas `to_offset` to parse frequency string in `date_range` (#9843) Pandas uses the [following regex](https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508) to convert frequency strings into offsets, which is exposed in the public API `pd.tseries.frequencies.to_offset`. Currently cudf depends on a [custom regex](https://github.com/rapidsai/cudf/blob/fdd9bb00dc0ba5ac373feaa079b782029130dae3/python/cudf/cudf/core/tools/datetimes.py#L819-L821) to perform the conversion. We probably shouldn't reinvent the wheel here as it might make it harder to track changes in pandas. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Sheilah Kirui (https://github.com/skirui-source) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9843 --- python/cudf/cudf/core/tools/datetimes.py | 41 ++++++++++++++--------- python/cudf/cudf/tests/test_datetime.py | 42 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 3efbd982b53..15426d0173a 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,6 +8,7 @@ import cupy as cp import numpy as np import pandas as pd +import pandas.tseries.offsets as pd_offset from pandas.core.tools.datetimes import _unit_map import cudf @@ -458,6 +459,17 @@ class DateOffset: "Y": "years", } + _TICK_OR_WEEK_TO_UNITS = { + pd_offset.Week: "weeks", + pd_offset.Day: "days", + pd_offset.Hour: "hours", + pd_offset.Minute: "minutes", + pd_offset.Second: "seconds", + pd_offset.Milli: "milliseconds", + pd_offset.Micro: "microseconds", + pd_offset.Nano: "nanoseconds", + } + _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") def 
__init__(self, n=1, normalize=False, **kwds): @@ -649,6 +661,13 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + @classmethod + def _from_pandas_ticks_or_weeks( + cls: Type[_T], + tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + ) -> _T: + return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) + def _maybe_as_fast_pandas_offset(self): if ( len(self.kwds) == 1 @@ -814,23 +833,15 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. - mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq - ) - if mo is None: + offset = pd.tseries.frequencies.to_offset(freq) + if not isinstance(offset, pd.tseries.offsets.Tick) and not isinstance( + offset, pd.tseries.offsets.Week + ): raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." + f"Unrecognized frequency string {freq}. cuDF does " + "not yet support month, quarter, year-anchored frequency." 
) - - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) + offset = DateOffset._from_pandas_ticks_or_weeks(offset) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 72601a3da2c..1a1b21aa3d5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,48 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. + if freqstr_unsupported != "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ##################################################################