From 107c08a36fe261b940353b70b355fbb30abd3531 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 15:26:59 -0800 Subject: [PATCH 1/9] consolidate freqstr handling --- python/cudf/cudf/core/tools/datetimes.py | 53 +++++++++++------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..ae1e73cb84e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -38,6 +38,8 @@ "min": "m", "s": "s", "S": "s", + "L": "ms", + "ms": "ms", "U": "us", "us": "us", "N": "ns", @@ -448,7 +450,6 @@ class DateOffset: "ns": "nanoseconds", "us": "microseconds", "ms": "milliseconds", - "L": "milliseconds", "s": "seconds", "m": "minutes", "h": "hours", @@ -458,7 +459,7 @@ class DateOffset: "Y": "years", } - _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") + _FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)") def __init__(self, n=1, normalize=False, **kwds): if normalize: @@ -629,27 +630,33 @@ def __repr__(self): return repr_str @classmethod - def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: + def _from_str(cls: Type[_T], freqstr: str) -> _T: """ - Parse a string and return a DateOffset object - expects strings of the form 3D, 25W, 10ms, 42ns, etc. - """ - match = cls._FREQSTR_REGEX.match(freqstr) + Parse a string and return a DateOffset object. + Expects strings of the form 3D, 25W, 10ms, 42ns, etc. + See `_offset_alias_to_code` and `_CODE_TO_UNITS` for + supported list of strings. 
+ """ + match = cls._FREQSTR_REGEX.fullmatch(freqstr) if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") - numeric_part = match.group(1) - if numeric_part == "": - numeric_part = "1" - freq_part = match.group(2) - - if freq_part not in cls._CODES_TO_UNITS: + sign_part, numeric_part, freq_part = match.groups() + if freq_part in _offset_alias_to_code: + code = _offset_alias_to_code[freq_part] + elif freq_part in cls._CODES_TO_UNITS: + code = freq_part + else: raise ValueError(f"Cannot interpret frequency str: {freqstr}") - return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + sign = -1 if sign_part else 1 + n = int(numeric_part) if numeric_part else 1 + code = _offset_alias_to_code[freq_part] - def _maybe_as_fast_pandas_offset(self): + return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) + + def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: if ( len(self.kwds) == 1 and _has_fixed_frequency(self) @@ -814,23 +821,11 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - # Map pandas `offset alias` into cudf DateOffset `CODE`, only - # fixed-frequency, non-anchored offset aliases are supported. - mo = re.fullmatch( - rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq - ) - if mo is None: + offset = DateOffset._from_str(freq) + if "months" in offset.kwds or "years" in offset.kwds: raise ValueError( f"Unrecognized or unsupported offset alias {freq}." ) - - sign, n, offset_alias = mo.groups() - code = _offset_alias_to_code[offset_alias] - - freq = "".join([n, code]) - offset = DateOffset._from_freqstr(freq) - if sign: - offset.kwds.update({s: -i for s, i in offset.kwds.items()}) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") From 25da96e5ce64fafc5aaa15286a660d497a883f0f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 15:42:56 -0800 Subject: [PATCH 2/9] . 
--- python/cudf/cudf/core/tools/datetimes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index ae1e73cb84e..d12cf0f4782 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -652,7 +652,6 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: sign = -1 if sign_part else 1 n = int(numeric_part) if numeric_part else 1 - code = _offset_alias_to_code[freq_part] return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) From 6cc207edc0cc06edf7286092b4b8a25348801f1b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 16:06:32 -0800 Subject: [PATCH 3/9] initial --- python/cudf/cudf/core/indexed_frame.py | 120 +++++++++++++++++++++++- python/cudf/cudf/tests/test_datetime.py | 38 ++++++++ 2 files changed, 157 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cf12907d96a..07f7c4ed842 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -13,7 +13,11 @@ import cudf from cudf._typing import ColumnLike -from cudf.api.types import is_categorical_dtype, is_list_like +from cudf.api.types import ( + is_categorical_dtype, + is_datetime_dtype, + is_list_like, +) from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index @@ -758,3 +762,117 @@ def resample( if isinstance(self, cudf.Series) else cudf.core.resample.DataFrameResampler(self, by=by) ) + + def first(self, offset): + """Select initial periods of time series data based on a date offset. + + When having a DataFrame with **sorted** dates as index, this function + can select the first few rows based on a date offset. + + Parameters + ---------- + offset: str + The offset length of the data that will be selected. For instance, + '1M' will display all rows having their index within the first + month. 
+ + Returns + ------- + Series or DataFrame + A subset of the caller. + + Raises + ------ + TypeError + If the index is not a ``DatetimeIndex`` + + Examples + -------- + >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + >>> ts.first('3D') + A + 2018-04-09 1 + 2018-04-11 2 + """ + if not is_datetime_dtype(self._index.dtype): + raise TypeError("'first' only supports a DatetimeIndex index.") + if not isinstance(offset, str): + raise NotImplementedError( + f"Unsupported offset type {type(offset)}." + ) + + if len(self) == 0: + return self.copy() + + pd_offset = cudf.DateOffset._from_str( + offset + )._maybe_as_fast_pandas_offset() + to_search = pd.Timestamp(self._index._column[0]) + pd_offset + slice_end = int( + self._index._column.searchsorted(to_search, side="left")[0] + ) + return self.iloc[:slice_end] + + def last(self, offset): + """Select final periods of time series data based on a date offset. + + When having a DataFrame with **sorted** dates as index, this function + can select the last few rows based on a date offset. + + Parameters + ---------- + offset: str + The offset length of the data that will be selected. For instance, + '3D' will display all rows having their index within the last 3 + days. + + Returns + ------- + Series or DataFrame + A subset of the caller. 
+ + Raises + ------ + TypeError + If the index is not a ``DatetimeIndex`` + + Examples + -------- + >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + >>> ts.last('3D') + A + 2018-04-13 3 + 2018-04-15 4 + """ + if not is_datetime_dtype(self._index.dtype): + raise TypeError("'last' only supports a DatetimeIndex index.") + if not isinstance(offset, str): + raise NotImplementedError( + f"Unsupported offset type {type(offset)}." + ) + + if len(self) == 0: + return self.copy() + + pd_offset = cudf.DateOffset._from_str( + offset + )._maybe_as_fast_pandas_offset() + to_search = pd.Timestamp(self._index._column[-1]) - pd_offset + slice_start = int( + self._index._column.searchsorted(to_search, side="right")[0] + ) + return self.iloc[slice_start:] diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..b321b216f82 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1786,3 +1786,41 @@ def test_ceil(data, time_type, resolution): expect = ps.dt.ceil(resolution) got = gs.dt.ceil(resolution) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.DatetimeIndex([]), + pd.DatetimeIndex(["2010-05-31"]), + pd.date_range("2000-01-01", "2000-12-31", periods=21), + ], +) +@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) +def test_first(idx, offset): + ps = pd.Series(range(len(idx)), index=idx) + gs = cudf.from_pandas(ps) + + expect = ps.first(offset=offset) + got = gs.first(offset=offset) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.DatetimeIndex([]), + pd.DatetimeIndex(["2010-05-31"]), + pd.date_range("2000-01-01", "2000-12-31", periods=21), + ], +) +@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) +def test_last(idx, offset): + ps = pd.Series(range(len(idx)), 
index=idx) + gs = cudf.from_pandas(ps) + + expect = ps.last(offset=offset) + got = gs.last(offset=offset) + + assert_eq(expect, got) From 63a4f3b40ceb44b474f2701e2bdc49d2f77c17c0 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 16:57:30 -0800 Subject: [PATCH 4/9] extend test cases --- python/cudf/cudf/tests/test_datetime.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index b321b216f82..9f5896b00fd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1797,12 +1797,13 @@ def test_ceil(data, time_type, resolution): ], ) @pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) -def test_first(idx, offset): - ps = pd.Series(range(len(idx)), index=idx) - gs = cudf.from_pandas(ps) +@pytest.mark.parametrize("klass", ["Series", "DataFrame"]) +def test_first(idx, offset, klass): + p = getattr(pd, klass)(range(len(idx)), index=idx) + g = cudf.from_pandas(p) - expect = ps.first(offset=offset) - got = gs.first(offset=offset) + expect = p.first(offset=offset) + got = g.first(offset=offset) assert_eq(expect, got) @@ -1816,11 +1817,12 @@ def test_first(idx, offset): ], ) @pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) -def test_last(idx, offset): - ps = pd.Series(range(len(idx)), index=idx) - gs = cudf.from_pandas(ps) +@pytest.mark.parametrize("klass", ["Series", "DataFrame"]) +def test_last(idx, offset, klass): + p = getattr(pd, klass)(range(len(idx)), index=idx) + g = cudf.from_pandas(p) - expect = ps.last(offset=offset) - got = gs.last(offset=offset) + expect = p.last(offset=offset) + got = g.last(offset=offset) assert_eq(expect, got) From 0c978913e7c0e5e3e93b5dbb43930906c69c7f7f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Nov 2021 21:38:24 -0800 Subject: [PATCH 5/9] consolidating logics into helper --- python/cudf/cudf/core/indexed_frame.py | 79 
+++++++++++++------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 07f7c4ed842..ca2313195a5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,8 +3,9 @@ from __future__ import annotations +import operator import warnings -from typing import Type, TypeVar +from typing import Callable, Type, TypeVar from uuid import uuid4 import cupy as cp @@ -13,11 +14,7 @@ import cudf from cudf._typing import ColumnLike -from cudf.api.types import ( - is_categorical_dtype, - is_datetime_dtype, - is_list_like, -) +from cudf.api.types import is_categorical_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index @@ -105,6 +102,7 @@ class IndexedFrame(Frame): # mypy can't handle bound type variables as class members _loc_indexer_type: Type[_LocIndexerClass] # type: ignore _iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore + _index: cudf.core.index.BaseIndex def __init__(self, data=None, index=None): super().__init__(data=data, index=index) @@ -763,6 +761,29 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _first_or_last( + self, offset, idx: int, op: Callable, side: str, slice_func: Callable + ) -> "IndexedFrame": + """Shared code path for ``first`` and ``last``.""" + if not isinstance(self._index, cudf.core.index.DatetimeIndex): + raise TypeError("'first' only supports a DatetimeIndex index.") + if not isinstance(offset, str): + raise NotImplementedError( + f"Unsupported offset type {type(offset)}." 
+ ) + + if len(self) == 0: + return self.copy() + + pd_offset = cudf.DateOffset._from_str( + offset + )._maybe_as_fast_pandas_offset() + to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset) + end_point = int( + self._index._column.searchsorted(to_search, side=side)[0] + ) + return slice_func(end_point) + def first(self, offset): """Select initial periods of time series data based on a date offset. @@ -801,24 +822,13 @@ def first(self, offset): 2018-04-09 1 2018-04-11 2 """ - if not is_datetime_dtype(self._index.dtype): - raise TypeError("'first' only supports a DatetimeIndex index.") - if not isinstance(offset, str): - raise NotImplementedError( - f"Unsupported offset type {type(offset)}." - ) - - if len(self) == 0: - return self.copy() - - pd_offset = cudf.DateOffset._from_str( - offset - )._maybe_as_fast_pandas_offset() - to_search = pd.Timestamp(self._index._column[0]) + pd_offset - slice_end = int( - self._index._column.searchsorted(to_search, side="left")[0] + return self._first_or_last( + offset, + idx=0, + op=operator.__add__, + side="left", + slice_func=lambda i: self.iloc[:i], ) - return self.iloc[:slice_end] def last(self, offset): """Select final periods of time series data based on a date offset. @@ -858,21 +868,10 @@ def last(self, offset): 2018-04-13 3 2018-04-15 4 """ - if not is_datetime_dtype(self._index.dtype): - raise TypeError("'last' only supports a DatetimeIndex index.") - if not isinstance(offset, str): - raise NotImplementedError( - f"Unsupported offset type {type(offset)}." 
- ) - - if len(self) == 0: - return self.copy() - - pd_offset = cudf.DateOffset._from_str( - offset - )._maybe_as_fast_pandas_offset() - to_search = pd.Timestamp(self._index._column[-1]) - pd_offset - slice_start = int( - self._index._column.searchsorted(to_search, side="right")[0] + return self._first_or_last( + offset, + idx=-1, + op=operator.__sub__, + side="right", + slice_func=lambda i: self.iloc[i:], ) - return self.iloc[slice_start:] From cbbc061ac944aa8ff53cd2c107aa12ff8dfc644a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Nov 2021 17:13:09 -0800 Subject: [PATCH 6/9] reject ambiguous input and add raise error message for unsupported types in date_range --- python/cudf/cudf/core/tools/datetimes.py | 41 ++++++++++++++++++++---- python/cudf/cudf/tests/test_datetime.py | 40 +++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index d12cf0f4782..288463e9a9f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -634,15 +634,35 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: """ Parse a string and return a DateOffset object. - Expects strings of the form 3D, 25W, 10ms, 42ns, etc. - See `_offset_alias_to_code` and `_CODE_TO_UNITS` for - supported list of strings. + A string can be a pandas `offset alias`_ or a + numpy `date/time unit code`_ + + Note that ``m`` (lower case) is ambiguous and is not accepted in this + function. Use ``T``/``min`` for minutely frequency and ``M`` (upper + case) for monthly frequency. + + Expects strings of the form 3D, 25W, -10ms, 42ns, etc. + + Not all offset aliases are supported. See `_offset_alias_to_code` and + `_CODE_TO_UNITS` for supported list of strings. 
""" match = cls._FREQSTR_REGEX.fullmatch(freqstr) if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") + # Decompose the string into separate components sign_part, numeric_part, freq_part = match.groups() + + # Handle various offset strings and normalize as codes + if freq_part == "m": + raise ValueError( + "Lower cased `m` is ambiguous. Use 'T'/'min' to specify " + "minutely frequency or upper cased `M` to specify monthly " + "frequency." + ) + if freq_part in _offset_alias_to_code: code = _offset_alias_to_code[freq_part] elif freq_part in cls._CODES_TO_UNITS: @@ -650,9 +670,11 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T: else: raise ValueError(f"Cannot interpret frequency str: {freqstr}") + # Handle sign and numerics sign = -1 if sign_part else 1 n = int(numeric_part) if numeric_part else 1 + # Construct the kwds dictionary return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset: @@ -820,11 +842,18 @@ def date_range( if isinstance(freq, DateOffset): offset = freq elif isinstance(freq, str): - offset = DateOffset._from_str(freq) - if "months" in offset.kwds or "years" in offset.kwds: + if ( + any( + x in freq.upper() + for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"} + ) + or "MS" in freq + ): raise ValueError( - f"Unrecognized or unsupported offset alias {freq}." + "date_range does not yet support month, quarter, year-anchored" + "or business-date frequency." 
) + offset = DateOffset._from_str(freq) else: raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..ccebe85fc9c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1583,6 +1583,46 @@ def test_date_range_raise_overflow(): cudf.date_range(start=start, periods=periods, freq=freq) +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1M", + "2SM", + "3MS", + "4BM", + "5CBM", + "6SMS", + "7BMS", + "8CBMS", + "Q", + "2BQ", + "3BQS", + "10A", + "10Y", + "9BA", + "9BY", + "8AS", + "8YS", + "7BAS", + "7BYS", + "BH", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # 3ms would mean a millisecondly frequencies, not month start frequencies + if not freqstr_unsupported == "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + ################################################################## # End of Date Range Test # ################################################################## From 19f50eed6bda23fdb13c39e967f0686171395512 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Nov 2021 23:09:26 -0800 Subject: [PATCH 7/9] improvement over tests, more offset aliases --- python/cudf/cudf/tests/test_datetime.py | 50 +++++++++++++++++-------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 275f5497515..cad33c5cddf 100644 --- 
a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1828,16 +1828,44 @@ def test_ceil(data, time_type, resolution): assert_eq(expect, got) -@pytest.mark.parametrize( - "idx", - [ +@pytest.fixture( + params=[ pd.DatetimeIndex([]), pd.DatetimeIndex(["2010-05-31"]), pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], + ] +) +def idx(request): + return request.param + + +@pytest.fixture( + params=[ + "10Y", + "6M", + "M", + "31D", + "0H", + "44640T", + "44640min", + "2678000S", + "2678000000L", + "2678000000ms", + "2678000000000U", + "2678000000000us", + "2678000000000000N", + "2678000000000000ns", + ] ) -@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) -@pytest.mark.parametrize("klass", ["Series", "DataFrame"]) +def offset(request): + return request.param + + +@pytest.fixture(params=["Series", "DataFrame"]) +def klass(request): + return request.param + + def test_first(idx, offset, klass): p = getattr(pd, klass)(range(len(idx)), index=idx) g = cudf.from_pandas(p) @@ -1848,16 +1876,6 @@ def test_first(idx, offset, klass): assert_eq(expect, got) -@pytest.mark.parametrize( - "idx", - [ - pd.DatetimeIndex([]), - pd.DatetimeIndex(["2010-05-31"]), - pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], -) -@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"]) -@pytest.mark.parametrize("klass", ["Series", "DataFrame"]) def test_last(idx, offset, klass): p = getattr(pd, klass)(range(len(idx)), index=idx) g = cudf.from_pandas(p) From ba8f2df45d8b21ef48581a217dbd51f65c264929 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 3 Dec 2021 19:18:22 -0800 Subject: [PATCH 8/9] resort to to convert freqstr and handle a corner case --- python/cudf/cudf/core/indexed_frame.py | 13 +++- python/cudf/cudf/tests/test_datetime.py | 84 +++++++++++++++++++------ 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 
ca2313195a5..38b55028a8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -775,10 +775,17 @@ def _first_or_last( if len(self) == 0: return self.copy() - pd_offset = cudf.DateOffset._from_str( - offset - )._maybe_as_fast_pandas_offset() + pd_offset = pd.tseries.frequencies.to_offset(offset) to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset) + if ( + idx == 0 + and not isinstance(pd_offset, pd.tseries.offsets.Tick) + and pd_offset.is_on_offset(pd.Timestamp(self._index[0])) + ): + # Special handle is required when the start time of the index + # is on the end of the offset. See pandas gh29623 for detail. + to_search = to_search - pd_offset.base + return self.loc[:to_search] end_point = int( self._index._column.searchsorted(to_search, side=side)[0] ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a38c46c6ef2..2b95b49c3bd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1861,19 +1861,17 @@ def test_floor(data, time_type, resolution): assert_eq(expect, got) -@pytest.fixture( - params=[ +@pytest.mark.parametrize( + "idx", + [ pd.DatetimeIndex([]), pd.DatetimeIndex(["2010-05-31"]), pd.date_range("2000-01-01", "2000-12-31", periods=21), - ] + ], ) -def idx(request): - return request.param - - -@pytest.fixture( - params=[ +@pytest.mark.parametrize( + "offset", + [ "10Y", "6M", "M", @@ -1888,19 +1886,40 @@ def idx(request): "2678000000000us", "2678000000000000N", "2678000000000000ns", - ] + ], ) -def offset(request): - return request.param +def test_first(idx, offset): + p = pd.Series(range(len(idx)), index=idx) + g = cudf.from_pandas(p) + expect = p.first(offset=offset) + got = g.first(offset=offset) -@pytest.fixture(params=["Series", "DataFrame"]) -def klass(request): - return request.param + assert_eq(expect, got) -def test_first(idx, offset, klass): - p = getattr(pd, klass)(range(len(idx)), index=idx) 
+@pytest.mark.parametrize( + # This test case tests correctness when start is end of month + "idx, offset", + [ + ( + pd.DatetimeIndex( + [ + "2020-01-31", + "2020-02-15", + "2020-02-29", + "2020-03-15", + "2020-03-31", + "2020-04-15", + "2020-04-30", + ] + ), + "3M", + ) + ], +) +def test_first_start_at_end_of_month(idx, offset): + p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) expect = p.first(offset=offset) @@ -1909,8 +1928,35 @@ def test_first(idx, offset, klass): assert_eq(expect, got) -def test_last(idx, offset, klass): - p = getattr(pd, klass)(range(len(idx)), index=idx) +@pytest.mark.parametrize( + "idx", + [ + pd.DatetimeIndex([]), + pd.DatetimeIndex(["2010-05-31"]), + pd.date_range("2000-01-01", "2000-12-31", periods=21), + ], +) +@pytest.mark.parametrize( + "offset", + [ + "10Y", + "6M", + "M", + "31D", + "0H", + "44640T", + "44640min", + "2678000S", + "2678000000L", + "2678000000ms", + "2678000000000U", + "2678000000000us", + "2678000000000000N", + "2678000000000000ns", + ], +) +def test_last(idx, offset): + p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) expect = p.last(offset=offset) From 1076a7c6748dd86aa79219eb6e1330573b4211f8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 15 Dec 2021 16:57:24 -0800 Subject: [PATCH 9/9] revert changes introduced from closed PR #9709 --- python/cudf/cudf/core/tools/datetimes.py | 52 ++++++------------------ 1 file changed, 12 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a78cb5787bf..15426d0173a 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -39,8 +39,6 @@ "min": "m", "s": "s", "S": "s", - "L": "ms", - "ms": "ms", "U": "us", "us": "us", "N": "ns", @@ -451,6 +449,7 @@ class DateOffset: "ns": "nanoseconds", "us": "microseconds", "ms": "milliseconds", + "L": "milliseconds", "s": "seconds", "m": "minutes", "h": "hours", @@ -642,52 
+641,25 @@ def __repr__(self): return repr_str @classmethod - def _from_str(cls: Type[_T], freqstr: str) -> _T: + def _from_freqstr(cls: Type[_T], freqstr: str) -> _T: """ - Parse a string and return a DateOffset object. - - A string can be a pandas `offset alias`_ or a - numpy `date/time unit code`_ - - Note that ``m`` (lower case) is ambiguous and is not accepted in this - function. Use ``T``/``min`` for minutely frequency and ``M`` (upper - case) for monthly frequency. - - Expects strings of the form 3D, 25W, -10ms, 42ns, etc. - - Not all offset aliases are supported. See `_offset_alias_to_code` and - `_CODE_TO_UNITS` for supported list of strings. + Parse a string and return a DateOffset object + expects strings of the form 3D, 25W, 10ms, 42ns, etc. """ - match = cls._FREQSTR_REGEX.fullmatch(freqstr) + match = cls._FREQSTR_REGEX.match(freqstr) + if match is None: raise ValueError(f"Invalid frequency string: {freqstr}") - # Decompose the string into separate components - sign_part, numeric_part, freq_part = match.groups() + numeric_part = match.group(1) + if numeric_part == "": + numeric_part = "1" + freq_part = match.group(2) - # Handle various offset strings and normalize as codes - if freq_part == "m": - raise ValueError( - "Lower cased `m` is ambiguous. Use 'T'/'min' to specify " - "minutely frequency or upper cased `M` to specify monthly " - "frequency." - ) - - if freq_part in _offset_alias_to_code: - code = _offset_alias_to_code[freq_part] - elif freq_part in cls._CODES_TO_UNITS: - code = freq_part - else: + if freq_part not in cls._CODES_TO_UNITS: raise ValueError(f"Cannot interpret frequency str: {freqstr}") - # Handle sign and numerics - sign = -1 if sign_part else 1 - n = int(numeric_part) if numeric_part else 1 - - # Construct the kwds dictionary - return cls(**{cls._CODES_TO_UNITS[code]: n * sign}) + return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) @classmethod def _from_pandas_ticks_or_weeks(