-
Notifications
You must be signed in to change notification settings - Fork 919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Consolidate offset string handling into factory _from_str
#9709
Changes from 3 commits
107c08a
25da96e
cbbc061
0208eaf
b3491e8
561caca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -38,6 +38,8 @@ | |||||
"min": "m", | ||||||
"s": "s", | ||||||
"S": "s", | ||||||
"L": "ms", | ||||||
"ms": "ms", | ||||||
"U": "us", | ||||||
"us": "us", | ||||||
"N": "ns", | ||||||
|
@@ -448,7 +450,6 @@ class DateOffset: | |||||
"ns": "nanoseconds", | ||||||
"us": "microseconds", | ||||||
"ms": "milliseconds", | ||||||
"L": "milliseconds", | ||||||
"s": "seconds", | ||||||
"m": "minutes", | ||||||
"h": "hours", | ||||||
|
@@ -458,7 +459,7 @@ class DateOffset: | |||||
"Y": "years", | ||||||
} | ||||||
|
||||||
_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") | ||||||
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)") | ||||||
|
||||||
def __init__(self, n=1, normalize=False, **kwds): | ||||||
if normalize: | ||||||
|
@@ -629,27 +630,54 @@ def __repr__(self): | |||||
return repr_str | ||||||
|
||||||
@classmethod | ||||||
def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: | ||||||
def _from_str(cls: Type[_T], freqstr: str) -> _T: | ||||||
""" | ||||||
Parse a string and return a DateOffset object | ||||||
expects strings of the form 3D, 25W, 10ms, 42ns, etc. | ||||||
""" | ||||||
match = cls._FREQSTR_REGEX.match(freqstr) | ||||||
Parse a string and return a DateOffset object. | ||||||
|
||||||
A string can be a pandas `offset alias`<https://pandas.pydata.org/\ | ||||||
pandas-docs/stable/user_guide/timeseries.html#offset-aliases>_ or a | ||||||
numpy `date/time unit code`<https://numpy.org/doc/stable/reference/arr\ | ||||||
ays.datetime.html#datetime-units>_ | ||||||
|
||||||
Note that ``m`` (lower case) is ambiguous and is not accepted in this | ||||||
function. Use ``T``/``min`` for minutely frequency and ``M`` (upper | ||||||
case) for monthly frequency. | ||||||
|
||||||
Expects strings of the form 3D, 25W, -10ms, 42ns, etc. | ||||||
|
||||||
Not all offset aliases are supported. See `_offset_alias_to_code` and | ||||||
`_CODE_TO_UNITS` for supported list of strings. | ||||||
""" | ||||||
match = cls._FREQSTR_REGEX.fullmatch(freqstr) | ||||||
if match is None: | ||||||
raise ValueError(f"Invalid frequency string: {freqstr}") | ||||||
|
||||||
numeric_part = match.group(1) | ||||||
if numeric_part == "": | ||||||
numeric_part = "1" | ||||||
freq_part = match.group(2) | ||||||
# Decompose the string into separate components | ||||||
sign_part, numeric_part, freq_part = match.groups() | ||||||
|
||||||
# Handle various offset strings and normalize as codes | ||||||
if freq_part == "m": | ||||||
raise ValueError( | ||||||
"Lower cased `m` is ambiguous. Use 'T'/'min' to specify " | ||||||
"minutely frequency or upper cased `M` to specify monthly " | ||||||
isVoid marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
"frequency." | ||||||
) | ||||||
|
||||||
if freq_part not in cls._CODES_TO_UNITS: | ||||||
if freq_part in _offset_alias_to_code: | ||||||
code = _offset_alias_to_code[freq_part] | ||||||
elif freq_part in cls._CODES_TO_UNITS: | ||||||
code = freq_part | ||||||
else: | ||||||
raise ValueError(f"Cannot interpret frequency str: {freqstr}") | ||||||
|
||||||
return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) | ||||||
# Handle sign and numerics | ||||||
sign = -1 if sign_part else 1 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Needed for my other regex suggestion above.
Suggested change
|
||||||
n = int(numeric_part) if numeric_part else 1 | ||||||
|
||||||
# Construct the kwds dictionary | ||||||
return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) | ||||||
|
||||||
def _maybe_as_fast_pandas_offset(self): | ||||||
def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: | ||||||
if ( | ||||||
len(self.kwds) == 1 | ||||||
and _has_fixed_frequency(self) | ||||||
|
@@ -814,23 +842,18 @@ def date_range( | |||||
if isinstance(freq, DateOffset): | ||||||
offset = freq | ||||||
elif isinstance(freq, str): | ||||||
# Map pandas `offset alias` into cudf DateOffset `CODE`, only | ||||||
# fixed-frequency, non-anchored offset aliases are supported. | ||||||
mo = re.fullmatch( | ||||||
rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq | ||||||
) | ||||||
if mo is None: | ||||||
if ( | ||||||
any( | ||||||
x in freq.upper() | ||||||
for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"} | ||||||
) | ||||||
or "MS" in freq | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah - the issue you pointed here is valid. The plan was to allow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Side note: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved away from checking the unsupported ones explicitly here - because that's basically just creating the complement set of supported ones in |
||||||
): | ||||||
raise ValueError( | ||||||
isVoid marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
f"Unrecognized or unsupported offset alias {freq}." | ||||||
"date_range does not yet support month, quarter, year-anchored" | ||||||
"or business-date frequency." | ||||||
) | ||||||
|
||||||
sign, n, offset_alias = mo.groups() | ||||||
code = _offset_alias_to_code[offset_alias] | ||||||
|
||||||
freq = "".join([n, code]) | ||||||
offset = DateOffset._from_freqstr(freq) | ||||||
if sign: | ||||||
offset.kwds.update({s: -i for s, i in offset.kwds.items()}) | ||||||
offset = DateOffset._from_str(freq) | ||||||
else: | ||||||
raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this might be the regex pandas uses, which handles a lot of cases this regex may miss. (This regex might not be for exactly the same purpose, but it demonstrates the types of cases we might need to handle.)
r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)"
https://github.com/pandas-dev/pandas/blob/8fefaa5a9a7c3f3a1c35c36c1140117dab73c9c7/pandas/_libs/tslibs/offsets.pyx#L3506-L3508
Do we need to support frequencies like
+10ms
? What about spaces in between like3 h
? I'd check these cases against pandas to see what is supported -- or find the exact regex that pandas uses to make sure we hit the same cases.