From 107c08a36fe261b940353b70b355fbb30abd3531 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 15:26:59 -0800 Subject: [PATCH 1/5] consolidate freqstr handling --- python/cudf/cudf/core/tools/datetimes.py | 53 +++++++++++------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..ae1e73cb84e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -38,6 +38,8 @@ "min": "m", "s": "s", "S": "s", + "L": "ms", + "ms": "ms", "U": "us", "us": "us", "N": "ns", @@ -448,7 +450,6 @@ class DateOffset: "ns": "nanoseconds", "us": "microseconds", "ms": "milliseconds", - "L": "milliseconds", "s": "seconds", "m": "minutes", "h": "hours", @@ -458,7 +459,7 @@ class DateOffset: "Y": "years", } - _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") + _FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): if normalize: @@ -629,27 +630,33 @@ def __repr__(self): return repr_str @classmethod - def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: + def _from_str(cls: Type[_T], freqstr: str) -> _T: """ - Parse a string and return a DateOffset object - expects strings of the form 3D, 25W, 10ms, 42ns, etc. - """ - match = cls._FREQSTR_REGEX.match(freqstr) + Parse a string and return a DateOffset object. + Expects strings of the form 3D, 25W, 10ms, 42ns, etc. + See `_offset_alias_to_code` and `_CODE_TO_UNITS` for + supported list of strings. + """ + match = cls._FREQSTR_REGEX.fullmatch(freqstr) if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") - numeric_part = match.group(1) - if numeric_part == "": - numeric_part = "1" - freq_part = match.group(2) - - if freq_part not in cls._CODES_TO_UNITS: + sign_part, numeric_part, freq_part = match.groups() + if freq_part in _offset_alias_to_code: + code = _offset_alias_to_code[freq_part] + elif freq_part in cls._CODES_TO_UNITS: + code = freq_part + else: raise ValueError(f"Cannot interpret frequency str: {freqstr}") - return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + sign = -1 if sign_part else 1 + n = int(numeric_part) if numeric_part else 1 + code = _offset_alias_to_code[freq_part] - def _maybe_as_fast_pandas_offset(self): + return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) + + def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: if ( len(self.kwds) == 1 and _has_fixed_frequency(self) @@ -814,23 +821,11 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. - mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq - ) - if mo is None: + offset = DateOffset._from_str(freq) + if "months" in offset.kwds or "years" in offset.kwds: raise ValueError( f"Unrecognized or unsupported offset alias {freq}." ) - - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") From 25da96e5ce64fafc5aaa15286a660d497a883f0f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 15:42:56 -0800 Subject: [PATCH 2/5] . --- python/cudf/cudf/core/tools/datetimes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index ae1e73cb84e..d12cf0f4782 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -652,7 +652,6 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: sign = -1 if sign_part else 1 n = int(numeric_part) if numeric_part else 1 - code = _offset_alias_to_code[freq_part] return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) From cbbc061ac944aa8ff53cd2c107aa12ff8dfc644a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Nov 2021 17:13:09 -0800 Subject: [PATCH 3/5] reject ambiguous input and add raise error message for unsupported types in date_range --- python/cudf/cudf/core/tools/datetimes.py | 41 ++++++++++++++++++++---- python/cudf/cudf/tests/test_datetime.py | 40 +++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index d12cf0f4782..288463e9a9f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -634,15 +634,35 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: """ Parse a string and return a DateOffset object. - Expects strings of the form 3D, 25W, 10ms, 42ns, etc. - See `_offset_alias_to_code` and `_CODE_TO_UNITS` for - supported list of strings. + A string can be a pandas `offset alias`_ or a + numpy `date/time unit code`_ + + Note that ``m`` (lower case) is ambiguous and is not accepted in this + function. Use ``T``/``min`` for minutely frequency and ``M`` (upper + case) for monthly frequency. + + Expects strings of the form 3D, 25W, -10ms, 42ns, etc. + + Not all offset aliases are supported. See `_offset_alias_to_code` and + `_CODE_TO_UNITS` for supported list of strings. """ match = cls._FREQSTR_REGEX.fullmatch(freqstr) if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") + # Decompose the string into separate components sign_part, numeric_part, freq_part = match.groups() + + # Handle various offset strings and normalize as codes + if freq_part == "m": + raise ValueError( + "Lower cased `m` is ambiguous. Use 'T'/'min' to specify " + "minutely frequency or upper cased `M` to specify monthly " + "frequency." + ) + if freq_part in _offset_alias_to_code: code = _offset_alias_to_code[freq_part] elif freq_part in cls._CODES_TO_UNITS: @@ -650,9 +670,11 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: else: raise ValueError(f"Cannot interpret frequency str: {freqstr}") + # Handle sign and numerics sign = -1 if sign_part else 1 n = int(numeric_part) if numeric_part else 1 + # Construct the kwds dictionary return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: @@ -820,11 +842,18 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - offset = DateOffset._from_str(freq) - if "months" in offset.kwds or "years" in offset.kwds: + if ( + any( + x in freq.upper() + for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"} + ) + or "MS" in freq + ): raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." + "date_range does not yet support month, quarter, year-anchored" + "or business-date frequency." ) + offset = DateOffset._from_str(freq) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..ccebe85fc9c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,46 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # 3ms would mean a millisecondly frequencies, not month start frequencies + if not freqstr_unsupported == "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ################################################################## From 0208eafbb53f90f189ae2a2310d5cb13883cb324 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 23 Nov 2021 18:45:04 -0800 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 288463e9a9f..981db6cbaa1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -659,7 +659,7 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: if freq_part == "m": raise ValueError( "Lower cased `m` is ambiguous. Use 'T'/'min' to specify " - "minutely frequency or upper cased `M` to specify monthly " + "minutely frequency or upper cased 'M' to specify monthly " "frequency." ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index ccebe85fc9c..cf13c3f8625 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1615,8 +1615,10 @@ def test_date_range_raise_unsupported(freqstr_unsupported): with pytest.raises(ValueError, match="does not yet support"): cudf.date_range(start=s, end=e, freq=freqstr_unsupported) - # 3ms would mean a millisecondly frequencies, not month start frequencies - if not freqstr_unsupported == "3MS": + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. + if freqstr_unsupported != "3MS": freqstr_unsupported = freqstr_unsupported.lower() pd.date_range(start=s, end=e, freq=freqstr_unsupported) with pytest.raises(ValueError, match="does not yet support"): From b3491e869b1913ebdc13687491fe8defcaab8d34 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 23 Nov 2021 19:47:04 -0800 Subject: [PATCH 5/5] do not explicitly test for unsupported types --- python/cudf/cudf/core/tools/datetimes.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 288463e9a9f..2213438d13d 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -842,18 +842,17 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - if ( - any( - x in freq.upper() - for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"} - ) - or "MS" in freq - ): - raise ValueError( - "date_range does not yet support month, quarter, year-anchored" - "or business-date frequency." - ) - offset = DateOffset._from_str(freq) + e = ValueError( + f"Unrecognized frequency string {freq}. cuDF does" + " not yet support month, quarter, year-anchored frequency." + ) + + if "M" in freq or "Y" in freq.upper(): + raise e + try: + offset = DateOffset._from_str(freq) + except ValueError: + raise e else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")