diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..35282002304 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -38,6 +38,8 @@ "min": "m", "s": "s", "S": "s", + "L": "ms", + "ms": "ms", "U": "us", "us": "us", "N": "ns", @@ -448,7 +450,6 @@ class DateOffset: "ns": "nanoseconds", "us": "microseconds", "ms": "milliseconds", - "L": "milliseconds", "s": "seconds", "m": "minutes", "h": "hours", @@ -458,7 +459,7 @@ class DateOffset: "Y": "years", } - _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") + _FREQSTR_REGEX = re.compile(r"(-)?([0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): if normalize: @@ -629,27 +630,54 @@ def __repr__(self): return repr_str @classmethod - def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: + def _from_str(cls: Type[_T], freqstr: str) -> _T: """ - Parse a string and return a DateOffset object - expects strings of the form 3D, 25W, 10ms, 42ns, etc. - """ - match = cls._FREQSTR_REGEX.match(freqstr) + Parse a string and return a DateOffset object. + + A string can be a pandas `offset alias`_ or a + numpy `date/time unit code`_ + + Note that ``m`` (lower case) is ambiguous and is not accepted in this + function. Use ``T``/``min`` for minutely frequency and ``M`` (upper + case) for monthly frequency. + + Expects strings of the form 3D, 25W, -10ms, 42ns, etc. + Not all offset aliases are supported. See `_offset_alias_to_code` and + `_CODES_TO_UNITS` for supported list of strings. 
+ """ + match = cls._FREQSTR_REGEX.fullmatch(freqstr) if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") - numeric_part = match.group(1) - if numeric_part == "": - numeric_part = "1" - freq_part = match.group(2) + # Decompose the string into separate components + sign_part, numeric_part, freq_part = match.groups() + + # Handle various offset strings and normalize as codes + if freq_part == "m": + raise ValueError( + "Lower cased `m` is ambiguous. Use 'T'/'min' to specify " + "minutely frequency or upper cased 'M' to specify monthly " + "frequency." + ) - if freq_part not in cls._CODES_TO_UNITS: + if freq_part in _offset_alias_to_code: + code = _offset_alias_to_code[freq_part] + elif freq_part in cls._CODES_TO_UNITS: + code = freq_part + else: raise ValueError(f"Cannot interpret frequency str: {freqstr}") - return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + # Handle sign and numerics + sign = -1 if sign_part else 1 + n = int(numeric_part) if numeric_part else 1 + + # Construct the kwds dictionary + return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) - def _maybe_as_fast_pandas_offset(self): + def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: if ( len(self.kwds) == 1 and _has_fixed_frequency(self) @@ -814,23 +842,17 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. - mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq + e = ValueError( + f"Unrecognized frequency string {freq}. cuDF does" + " not yet support month, quarter, year-anchored frequency." ) - if mo is None: - raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." 
- ) - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) + if "M" in freq or "Y" in freq.upper(): + raise e + try: + offset = DateOffset._from_str(freq) + except ValueError as err: + raise e from err else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..cf13c3f8625 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,48 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. + if freqstr_unsupported != "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ##################################################################