Skip to content

Commit

Permalink
Use pandas to_offset to parse frequency string in date_range (#9843)
Browse files Browse the repository at this point in the history
Pandas uses the [following regex](https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508) to convert frequency strings into offsets, which is exposed in the public API as `pd.tseries.frequencies.to_offset`. Currently cudf depends on a [custom regex](https://github.com/rapidsai/cudf/blob/fdd9bb00dc0ba5ac373feaa079b782029130dae3/python/cudf/cudf/core/tools/datetimes.py#L819-L821) to perform the conversion. We probably shouldn't reinvent the wheel here, as it might make it harder to track changes in pandas.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Sheilah Kirui (https://github.com/skirui-source)
  - Bradley Dice (https://github.com/bdice)

URL: #9843
  • Loading branch information
isVoid authored Dec 16, 2021
1 parent 0faf2af commit 56430b4
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 15 deletions.
41 changes: 26 additions & 15 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import cupy as cp
import numpy as np
import pandas as pd
import pandas.tseries.offsets as pd_offset
from pandas.core.tools.datetimes import _unit_map

import cudf
Expand Down Expand Up @@ -458,6 +459,17 @@ class DateOffset:
"Y": "years",
}

# Maps pandas fixed-frequency offset classes (the tick types plus Week) to
# the unit keyword used to build a DateOffset from them; looked up by the
# exact type of the pandas offset in `_from_pandas_ticks_or_weeks`.
_TICK_OR_WEEK_TO_UNITS = {
pd_offset.Week: "weeks",
pd_offset.Day: "days",
pd_offset.Hour: "hours",
pd_offset.Minute: "minutes",
pd_offset.Second: "seconds",
pd_offset.Milli: "milliseconds",
pd_offset.Micro: "microseconds",
pd_offset.Nano: "nanoseconds",
}

# Frequency-string pattern: an optional run of digits (group 1) followed by
# one or more ASCII letters (group 2).
_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")

def __init__(self, n=1, normalize=False, **kwds):
Expand Down Expand Up @@ -649,6 +661,13 @@ def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:

return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})

@classmethod
def _from_pandas_ticks_or_weeks(
    cls: Type[_T],
    tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week],
) -> _T:
    """Build an instance of ``cls`` from a pandas tick or week offset.

    The unit keyword is looked up in ``cls._TICK_OR_WEEK_TO_UNITS`` by the
    exact type of ``tick``, and ``tick.n`` is passed as its value.

    Parameters
    ----------
    tick
        A pandas offset whose concrete type is a key of
        ``_TICK_OR_WEEK_TO_UNITS``.

    Returns
    -------
    An instance of ``cls`` constructed with a single ``unit=n`` keyword.

    Raises
    ------
    KeyError
        If ``type(tick)`` is not a key of ``_TICK_OR_WEEK_TO_UNITS``.
    """
    unit_name = cls._TICK_OR_WEEK_TO_UNITS[type(tick)]
    unit_kwargs = {unit_name: tick.n}
    return cls(**unit_kwargs)

def _maybe_as_fast_pandas_offset(self):
if (
len(self.kwds) == 1
Expand Down Expand Up @@ -814,23 +833,15 @@ def date_range(
if isinstance(freq, DateOffset):
offset = freq
elif isinstance(freq, str):
# Map pandas `offset alias` into cudf DateOffset `CODE`, only
# fixed-frequency, non-anchored offset aliases are supported.
mo = re.fullmatch(
rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq
)
if mo is None:
offset = pd.tseries.frequencies.to_offset(freq)
if not isinstance(offset, pd.tseries.offsets.Tick) and not isinstance(
offset, pd.tseries.offsets.Week
):
raise ValueError(
f"Unrecognized or unsupported offset alias {freq}."
f"Unrecognized frequency string {freq}. cuDF does "
"not yet support month, quarter, year-anchored frequency."
)

sign, n, offset_alias = mo.groups()
code = _offset_alias_to_code[offset_alias]

freq = "".join([n, code])
offset = DateOffset._from_freqstr(freq)
if sign:
offset.kwds.update({s: -i for s, i in offset.kwds.items()})
offset = DateOffset._from_pandas_ticks_or_weeks(offset)
else:
raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")

Expand Down
42 changes: 42 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,48 @@ def test_date_range_raise_overflow():
cudf.date_range(start=start, periods=periods, freq=freq)


@pytest.mark.parametrize(
    "freqstr_unsupported",
    [
        "1M",
        "2SM",
        "3MS",
        "4BM",
        "5CBM",
        "6SMS",
        "7BMS",
        "8CBMS",
        "Q",
        "2BQ",
        "3BQS",
        "10A",
        "10Y",
        "9BA",
        "9BY",
        "8AS",
        "8YS",
        "7BAS",
        "7BYS",
        "BH",
        "B",
    ],
)
def test_date_range_raise_unsupported(freqstr_unsupported):
    """Check that cudf.date_range rejects frequencies it does not support.

    For each frequency string, pandas.date_range is called first (an
    exception from pandas would fail the test), then cudf.date_range is
    expected to raise a ValueError whose message matches
    "does not yet support".
    """
    range_start, range_end = "2001-01-01", "2008-01-31"

    candidates = [freqstr_unsupported]
    # We also check that these values are unsupported when using lowercase
    # characters. We exclude the value 3MS (every 3 month starts) because 3ms
    # is a valid frequency for every 3 milliseconds.
    if freqstr_unsupported != "3MS":
        candidates.append(freqstr_unsupported.lower())

    for freqstr in candidates:
        pd.date_range(start=range_start, end=range_end, freq=freqstr)
        with pytest.raises(ValueError, match="does not yet support"):
            cudf.date_range(start=range_start, end=range_end, freq=freqstr)


##################################################################
# End of Date Range Test #
##################################################################
Expand Down

0 comments on commit 56430b4

Please sign in to comment.