Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate offset string handling into factory _from_str #9709

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 52 additions & 29 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
"min": "m",
"s": "s",
"S": "s",
"L": "ms",
"ms": "ms",
"U": "us",
"us": "us",
"N": "ns",
Expand Down Expand Up @@ -448,7 +450,6 @@ class DateOffset:
"ns": "nanoseconds",
"us": "microseconds",
"ms": "milliseconds",
"L": "milliseconds",
"s": "seconds",
"m": "minutes",
"h": "hours",
Expand All @@ -458,7 +459,7 @@ class DateOffset:
"Y": "years",
}

_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this might be the regex pandas uses, which handles a lot of cases this regex may miss. (This regex might not be for exactly the same purpose, but it demonstrates the types of cases we might need to handle.)

r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)"
https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508

Do we need to support frequencies like +10ms? What about spaces in between like 3 h? I'd check these cases against pandas to see what is supported -- or find the exact regex that pandas uses to make sure we hit the same cases.

Suggested change
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")
_FREQSTR_REGEX = re.compile("([+\-])?(\d*)\s*([A-Za-z]+)")


def __init__(self, n=1, normalize=False, **kwds):
if normalize:
Expand Down Expand Up @@ -629,27 +630,54 @@ def __repr__(self):
return repr_str

@classmethod
def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
def _from_str(cls: Type[_T], freqstr: str) -> _T:
"""
Parse a string and return a DateOffset object
expects strings of the form 3D, 25W, 10ms, 42ns, etc.
"""
match = cls._FREQSTR_REGEX.match(freqstr)
Parse a string and return a DateOffset object.

A string can be a pandas `offset alias`<https://pandas.pydata.org/\
pandas-docs/stable/user_guide/timeseries.html#offset-aliases>_ or a
numpy `date/time unit code`<https://numpy.org/doc/stable/reference/arr\
ays.datetime.html#datetime-units>_

Note that ``m`` (lower case) is ambiguous and is not accepted in this
function. Use ``T``/``min`` for minutely frequency and ``M`` (upper
case) for monthly frequency.

Expects strings of the form 3D, 25W, -10ms, 42ns, etc.

Not all offset aliases are supported. See `_offset_alias_to_code` and
`_CODE_TO_UNITS` for supported list of strings.
"""
match = cls._FREQSTR_REGEX.fullmatch(freqstr)
if match is None:
raise ValueError(f"Invalid frequency string: {freqstr}")

numeric_part = match.group(1)
if numeric_part == "":
numeric_part = "1"
freq_part = match.group(2)
# Decompose the string into separate components
sign_part, numeric_part, freq_part = match.groups()

# Handle various offset strings and normalize as codes
if freq_part == "m":
raise ValueError(
"Lower cased `m` is ambiguous. Use 'T'/'min' to specify "
"minutely frequency or upper cased `M` to specify monthly "
isVoid marked this conversation as resolved.
Show resolved Hide resolved
"frequency."
)

if freq_part not in cls._CODES_TO_UNITS:
if freq_part in _offset_alias_to_code:
code = _offset_alias_to_code[freq_part]
elif freq_part in cls._CODES_TO_UNITS:
code = freq_part
else:
raise ValueError(f"Cannot interpret frequency str: {freqstr}")

return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})
# Handle sign and numerics
sign = -1 if sign_part else 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needed for my other regex suggestion above.

Suggested change
sign = -1 if sign_part else 1
sign = -1 if sign_part.startswith("-") else 1

n = int(numeric_part) if numeric_part else 1

# Construct the kwds dictionary
return cls(**{cls._CODES_TO_UNITS[code]: n * sign})

def _maybe_as_fast_pandas_offset(self):
def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset:
if (
len(self.kwds) == 1
and _has_fixed_frequency(self)
Expand Down Expand Up @@ -814,23 +842,18 @@ def date_range(
if isinstance(freq, DateOffset):
offset = freq
elif isinstance(freq, str):
# Map pandas `offset alias` into cudf DateOffset `CODE`, only
# fixed-frequency, non-anchored offset aliases are supported.
mo = re.fullmatch(
rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq
)
if mo is None:
if (
any(
x in freq.upper()
for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"}
)
or "MS" in freq
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like L, ms should be supported aliases. They're not listed in the docstring on line 765-766, and it looks like 3ms would raise an error here because it contains M when converted to uppercase? I feel like I'm missing something, since those values were added to _offset_alias_to_code in this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah - the issue you pointed here is valid. The plan was to allow ms but reject M and MS.

Copy link
Contributor Author

@isVoid isVoid Nov 24, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Side note: ms is currently not tested in date_range tests because the tool we used to process datetimes from pandas does not support ms yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved away from checking the unsupported ones explicitly here - because that's basically just creating the complement set of supported ones in _from_str. Instead, I'm wrapping _from_str in try-except blocks.

):
raise ValueError(
isVoid marked this conversation as resolved.
Show resolved Hide resolved
f"Unrecognized or unsupported offset alias {freq}."
"date_range does not yet support month, quarter, year-anchored"
"or business-date frequency."
)

sign, n, offset_alias = mo.groups()
code = _offset_alias_to_code[offset_alias]

freq = "".join([n, code])
offset = DateOffset._from_freqstr(freq)
if sign:
offset.kwds.update({s: -i for s, i in offset.kwds.items()})
offset = DateOffset._from_str(freq)
else:
raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")

Expand Down
40 changes: 40 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,46 @@ def test_date_range_raise_overflow():
cudf.date_range(start=start, periods=periods, freq=freq)


@pytest.mark.parametrize(
"freqstr_unsupported",
[
"1M",
"2SM",
"3MS",
"4BM",
"5CBM",
"6SMS",
"7BMS",
"8CBMS",
"Q",
"2BQ",
"3BQS",
"10A",
"10Y",
"9BA",
"9BY",
"8AS",
"8YS",
"7BAS",
"7BYS",
"BH",
"B",
],
)
def test_date_range_raise_unsupported(freqstr_unsupported):
s, e = "2001-01-01", "2008-01-31"
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)

# 3ms would mean a millisecondly frequencies, not month start frequencies
isVoid marked this conversation as resolved.
Show resolved Hide resolved
if not freqstr_unsupported == "3MS":
isVoid marked this conversation as resolved.
Show resolved Hide resolved
freqstr_unsupported = freqstr_unsupported.lower()
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)


##################################################################
# End of Date Range Test #
##################################################################
Expand Down