From 5ee4e483f0770377ac1b81e320c13b08f637c759 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 3 Dec 2021 20:59:38 -0800 Subject: [PATCH 1/3] Resort to pandas to_offset for string parsing --- python/cudf/cudf/core/tools/datetimes.py | 35 +++++++++++--------- python/cudf/cudf/tests/test_datetime.py | 42 ++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..1728026ffad 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,6 +8,7 @@ import cupy as cp import numpy as np import pandas as pd +import pandas.tseries.offsets as pd_offset from pandas.core.tools.datetimes import _unit_map import cudf @@ -458,6 +459,16 @@ class DateOffset: "Y": "years", } + _TICK_TO_UNITS = { + pd_offset.Day: "days", + pd_offset.Hour: "hours", + pd_offset.Minute: "minutes", + pd_offset.Second: "seconds", + pd_offset.Milli: "milliseconds", + pd_offset.Micro: "microseconds", + pd_offset.Nano: "nanoseconds", + } + _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): @@ -649,6 +660,10 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + @classmethod + def _from_pandas_ticks(cls: Type[_T], tick: pd.tseries.offsets.Tick) -> _T: + return cls(**{cls._TICK_TO_UNITS[type(tick)]: tick.n}) + def _maybe_as_fast_pandas_offset(self): if ( len(self.kwds) == 1 @@ -814,23 +829,13 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. 
- mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq - ) - if mo is None: + offset = pd.tseries.frequencies.to_offset(freq) + if not isinstance(offset, pd.tseries.offsets.Tick): raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." + f"Unrecognized frequency string {freq}. cuDF does" + " not yet support month, quarter, year-anchored frequency." ) - - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) + offset = DateOffset._from_pandas_ticks(offset) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a95be4f7932..650426fd508 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,48 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. 
+ if freqstr_unsupported != "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ################################################################## From 3701f4a1980c43879ac16e212c1b0d20b157e61e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 14 Dec 2021 16:23:20 -0800 Subject: [PATCH 2/3] take Week into consideration too --- python/cudf/cudf/core/tools/datetimes.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 1728026ffad..e064e28b9c1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -459,7 +459,8 @@ class DateOffset: "Y": "years", } - _TICK_TO_UNITS = { + _TICK_OR_WEEK_TO_UNITS = { + pd_offset.Week: "weeks", pd_offset.Day: "days", pd_offset.Hour: "hours", pd_offset.Minute: "minutes", @@ -661,8 +662,11 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) @classmethod - def _from_pandas_ticks(cls: Type[_T], tick: pd.tseries.offsets.Tick) -> _T: - return cls(**{cls._TICK_TO_UNITS[type(tick)]: tick.n}) + def _from_pandas_ticks_or_weeks( + cls: Type[_T], + tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + ) -> _T: + return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) def _maybe_as_fast_pandas_offset(self): if ( @@ -830,12 +834,14 @@ def date_range( offset = freq elif isinstance(freq, str): offset = pd.tseries.frequencies.to_offset(freq) - if not isinstance(offset, pd.tseries.offsets.Tick): + if not isinstance(offset, pd.tseries.offsets.Tick) and not isinstance( + offset, pd.tseries.offsets.Week + ): raise 
ValueError( f"Unrecognized frequency string {freq}. cuDF does" " not yet support month, quarter, year-anchored frequency." ) - offset = DateOffset._from_pandas_ticks(offset) + offset = DateOffset._from_pandas_ticks_or_weeks(offset) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") From b057746196fa0d8172f7b9d1f71afa25067e3b46 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 14 Dec 2021 19:05:14 -0800 Subject: [PATCH 3/3] Update python/cudf/cudf/core/tools/datetimes.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index e064e28b9c1..0bf39d97815 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -838,8 +838,8 @@ def date_range( offset, pd.tseries.offsets.Week ): raise ValueError( - f"Unrecognized frequency string {freq}. cuDF does" - " not yet support month, quarter, year-anchored frequency." + f"Unrecognized frequency string {freq}. cuDF does " + "not yet support month, quarter, year-anchored frequency." ) offset = DateOffset._from_pandas_ticks_or_weeks(offset) else: