Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate offset string handling into factory _from_str #9709

Closed
Closed
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 23 additions & 29 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
"min": "m",
"s": "s",
"S": "s",
"L": "ms",
"ms": "ms",
"U": "us",
"us": "us",
"N": "ns",
Expand Down Expand Up @@ -448,7 +450,6 @@ class DateOffset:
"ns": "nanoseconds",
"us": "microseconds",
"ms": "milliseconds",
"L": "milliseconds",
"s": "seconds",
"m": "minutes",
"h": "hours",
Expand All @@ -458,7 +459,7 @@ class DateOffset:
"Y": "years",
}

_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this might be the regex pandas uses, which handles a lot of cases this regex may miss. (This regex might not be for exactly the same purpose, but it demonstrates the types of cases we might need to handle.)

r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)"
https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508

Do we need to support frequencies like +10ms? What about spaces in between like 3 h? I'd check these cases against pandas to see what is supported -- or find the exact regex that pandas uses to make sure we hit the same cases.

Suggested change
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")
_FREQSTR_REGEX = re.compile("([+\-])?(\d*)\s*([A-Za-z]+)")


def __init__(self, n=1, normalize=False, **kwds):
if normalize:
Expand Down Expand Up @@ -629,27 +630,32 @@ def __repr__(self):
return repr_str

@classmethod
def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
def _from_str(cls: Type[_T], freqstr: str) -> _T:
"""
Parse a string and return a DateOffset object
expects strings of the form 3D, 25W, 10ms, 42ns, etc.
"""
match = cls._FREQSTR_REGEX.match(freqstr)
Parse a string and return a DateOffset object.

Expects strings of the form 3D, 25W, 10ms, 42ns, etc.
See `_offset_alias_to_code` and `_CODE_TO_UNITS` for
supported list of strings.
"""
match = cls._FREQSTR_REGEX.fullmatch(freqstr)
if match is None:
raise ValueError(f"Invalid frequency string: {freqstr}")

numeric_part = match.group(1)
if numeric_part == "":
numeric_part = "1"
freq_part = match.group(2)

if freq_part not in cls._CODES_TO_UNITS:
sign_part, numeric_part, freq_part = match.groups()
if freq_part in _offset_alias_to_code:
code = _offset_alias_to_code[freq_part]
elif freq_part in cls._CODES_TO_UNITS:
code = freq_part
else:
raise ValueError(f"Cannot interpret frequency str: {freqstr}")

return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})
sign = -1 if sign_part else 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needed for my other regex suggestion above.

Suggested change
sign = -1 if sign_part else 1
sign = -1 if sign_part.startswith("-") else 1

n = int(numeric_part) if numeric_part else 1

def _maybe_as_fast_pandas_offset(self):
return cls(**{cls._CODES_TO_UNITS[code]: n * sign})

def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset:
if (
len(self.kwds) == 1
and _has_fixed_frequency(self)
Expand Down Expand Up @@ -814,23 +820,11 @@ def date_range(
if isinstance(freq, DateOffset):
offset = freq
elif isinstance(freq, str):
# Map pandas `offset alias` into cudf DateOffset `CODE`, only
# fixed-frequency, non-anchored offset aliases are supported.
mo = re.fullmatch(
rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq
)
if mo is None:
offset = DateOffset._from_str(freq)
if "months" in offset.kwds or "years" in offset.kwds:
raise ValueError(
isVoid marked this conversation as resolved.
Show resolved Hide resolved
f"Unrecognized or unsupported offset alias {freq}."
)

sign, n, offset_alias = mo.groups()
code = _offset_alias_to_code[offset_alias]

freq = "".join([n, code])
offset = DateOffset._from_freqstr(freq)
if sign:
offset.kwds.update({s: -i for s, i in offset.kwds.items()})
else:
raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")

Expand Down