From 107c08a36fe261b940353b70b355fbb30abd3531 Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 16 Nov 2021 15:26:59 -0800
Subject: [PATCH 1/9] consolidate freqstr handling

---
 python/cudf/cudf/core/tools/datetimes.py | 53 +++++++++++-------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 34d62ffc048..ae1e73cb84e 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -38,6 +38,8 @@
     "min": "m",
     "s": "s",
     "S": "s",
+    "L": "ms",
+    "ms": "ms",
     "U": "us",
     "us": "us",
     "N": "ns",
@@ -448,7 +450,6 @@ class DateOffset:
         "ns": "nanoseconds",
         "us": "microseconds",
         "ms": "milliseconds",
-        "L": "milliseconds",
         "s": "seconds",
         "m": "minutes",
         "h": "hours",
@@ -458,7 +459,7 @@ class DateOffset:
         "Y": "years",
     }
 
-    _FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
+    _FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")
 
     def __init__(self, n=1, normalize=False, **kwds):
         if normalize:
@@ -629,27 +630,33 @@ def __repr__(self):
         return repr_str
 
     @classmethod
-    def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
+    def _from_str(cls: Type[_T], freqstr: str) -> _T:
         """
-        Parse a string and return a DateOffset object
-        expects strings of the form 3D, 25W, 10ms, 42ns, etc.
-        """
-        match = cls._FREQSTR_REGEX.match(freqstr)
+        Parse a string and return a DateOffset object.
 
+        Expects strings of the form 3D, 25W, 10ms, 42ns, etc.
+        See `_offset_alias_to_code` and `_CODE_TO_UNITS` for
+        supported list of strings.
+        """
+        match = cls._FREQSTR_REGEX.fullmatch(freqstr)
         if match is None:
             raise ValueError(f"Invalid frequency string: {freqstr}")
 
-        numeric_part = match.group(1)
-        if numeric_part == "":
-            numeric_part = "1"
-        freq_part = match.group(2)
-
-        if freq_part not in cls._CODES_TO_UNITS:
+        sign_part, numeric_part, freq_part = match.groups()
+        if freq_part in _offset_alias_to_code:
+            code = _offset_alias_to_code[freq_part]
+        elif freq_part in cls._CODES_TO_UNITS:
+            code = freq_part
+        else:
             raise ValueError(f"Cannot interpret frequency str: {freqstr}")
 
-        return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})
+        sign = -1 if sign_part else 1
+        n = int(numeric_part) if numeric_part else 1
+        code = _offset_alias_to_code[freq_part]
 
-    def _maybe_as_fast_pandas_offset(self):
+        return cls(**{cls._CODES_TO_UNITS[code]: n * sign})
+
+    def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset:
         if (
             len(self.kwds) == 1
             and _has_fixed_frequency(self)
@@ -814,23 +821,11 @@ def date_range(
     if isinstance(freq, DateOffset):
         offset = freq
     elif isinstance(freq, str):
-        # Map pandas `offset alias` into cudf DateOffset `CODE`, only
-        # fixed-frequency, non-anchored offset aliases are supported.
-        mo = re.fullmatch(
-            rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq
-        )
-        if mo is None:
+        offset = DateOffset._from_str(freq)
+        if "months" in offset.kwds or "years" in offset.kwds:
             raise ValueError(
                 f"Unrecognized or unsupported offset alias {freq}."
             )
-
-        sign, n, offset_alias = mo.groups()
-        code = _offset_alias_to_code[offset_alias]
-
-        freq = "".join([n, code])
-        offset = DateOffset._from_freqstr(freq)
-        if sign:
-            offset.kwds.update({s: -i for s, i in offset.kwds.items()})
     else:
         raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")
 

From 25da96e5ce64fafc5aaa15286a660d497a883f0f Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 16 Nov 2021 15:42:56 -0800
Subject: [PATCH 2/9] .

---
 python/cudf/cudf/core/tools/datetimes.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index ae1e73cb84e..d12cf0f4782 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -652,7 +652,6 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T:
 
         sign = -1 if sign_part else 1
         n = int(numeric_part) if numeric_part else 1
-        code = _offset_alias_to_code[freq_part]
 
         return cls(**{cls._CODES_TO_UNITS[code]: n * sign})
 

From 6cc207edc0cc06edf7286092b4b8a25348801f1b Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 16 Nov 2021 16:06:32 -0800
Subject: [PATCH 3/9] initial

---
 python/cudf/cudf/core/indexed_frame.py  | 120 +++++++++++++++++++++++-
 python/cudf/cudf/tests/test_datetime.py |  38 ++++++++
 2 files changed, 157 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index cf12907d96a..07f7c4ed842 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -13,7 +13,11 @@
 
 import cudf
 from cudf._typing import ColumnLike
-from cudf.api.types import is_categorical_dtype, is_list_like
+from cudf.api.types import (
+    is_categorical_dtype,
+    is_datetime_dtype,
+    is_list_like,
+)
 from cudf.core.column import arange
 from cudf.core.frame import Frame
 from cudf.core.index import Index
@@ -758,3 +762,117 @@ def resample(
             if isinstance(self, cudf.Series)
             else cudf.core.resample.DataFrameResampler(self, by=by)
         )
+
+    def first(self, offset):
+        """Select initial periods of time series data based on a date offset.
+
+        When having a DataFrame with **sorted** dates as index, this function
+        can select the first few rows based on a date offset.
+
+        Parameters
+        ----------
+        offset: str
+            The offset length of the data that will be selected. For instance,
+            '1M' will display all rows having their index within the first
+            month.
+
+        Returns
+        -------
+        Series or DataFrame
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a ``DatetimeIndex``
+
+        Examples
+        --------
+        >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
+        >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+        >>> ts
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+        >>> ts.first('3D')
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        """
+        if not is_datetime_dtype(self._index.dtype):
+            raise TypeError("'first' only supports a DatetimeIndex index.")
+        if not isinstance(offset, str):
+            raise NotImplementedError(
+                f"Unsupported offset type {type(offset)}."
+            )
+
+        if len(self) == 0:
+            return self.copy()
+
+        pd_offset = cudf.DateOffset._from_str(
+            offset
+        )._maybe_as_fast_pandas_offset()
+        to_search = pd.Timestamp(self._index._column[0]) + pd_offset
+        slice_end = int(
+            self._index._column.searchsorted(to_search, side="left")[0]
+        )
+        return self.iloc[:slice_end]
+
+    def last(self, offset):
+        """Select final periods of time series data based on a date offset.
+
+        When having a DataFrame with **sorted** dates as index, this function
+        can select the last few rows based on a date offset.
+
+        Parameters
+        ----------
+        offset: str
+            The offset length of the data that will be selected. For instance,
+            '3D' will display all rows having their index within the last 3
+            days.
+
+        Returns
+        -------
+        Series or DataFrame
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a ``DatetimeIndex``
+
+        Examples
+        --------
+        >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
+        >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+        >>> ts
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+        >>> ts.last('3D')
+                    A
+        2018-04-13  3
+        2018-04-15  4
+        """
+        if not is_datetime_dtype(self._index.dtype):
+            raise TypeError("'last' only supports a DatetimeIndex index.")
+        if not isinstance(offset, str):
+            raise NotImplementedError(
+                f"Unsupported offset type {type(offset)}."
+            )
+
+        if len(self) == 0:
+            return self.copy()
+
+        pd_offset = cudf.DateOffset._from_str(
+            offset
+        )._maybe_as_fast_pandas_offset()
+        to_search = pd.Timestamp(self._index._column[-1]) - pd_offset
+        slice_start = int(
+            self._index._column.searchsorted(to_search, side="right")[0]
+        )
+        return self.iloc[slice_start:]
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index d666dfc0ec1..b321b216f82 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1786,3 +1786,41 @@ def test_ceil(data, time_type, resolution):
     expect = ps.dt.ceil(resolution)
     got = gs.dt.ceil(resolution)
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.DatetimeIndex([]),
+        pd.DatetimeIndex(["2010-05-31"]),
+        pd.date_range("2000-01-01", "2000-12-31", periods=21),
+    ],
+)
+@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
+def test_first(idx, offset):
+    ps = pd.Series(range(len(idx)), index=idx)
+    gs = cudf.from_pandas(ps)
+
+    expect = ps.first(offset=offset)
+    got = gs.first(offset=offset)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.DatetimeIndex([]),
+        pd.DatetimeIndex(["2010-05-31"]),
+        pd.date_range("2000-01-01", "2000-12-31", periods=21),
+    ],
+)
+@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
+def test_last(idx, offset):
+    ps = pd.Series(range(len(idx)), index=idx)
+    gs = cudf.from_pandas(ps)
+
+    expect = ps.last(offset=offset)
+    got = gs.last(offset=offset)
+
+    assert_eq(expect, got)

From 63a4f3b40ceb44b474f2701e2bdc49d2f77c17c0 Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 16 Nov 2021 16:57:30 -0800
Subject: [PATCH 4/9] extend test cases

---
 python/cudf/cudf/tests/test_datetime.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index b321b216f82..9f5896b00fd 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1797,12 +1797,13 @@ def test_ceil(data, time_type, resolution):
     ],
 )
 @pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
-def test_first(idx, offset):
-    ps = pd.Series(range(len(idx)), index=idx)
-    gs = cudf.from_pandas(ps)
+@pytest.mark.parametrize("klass", ["Series", "DataFrame"])
+def test_first(idx, offset, klass):
+    p = getattr(pd, klass)(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
 
-    expect = ps.first(offset=offset)
-    got = gs.first(offset=offset)
+    expect = p.first(offset=offset)
+    got = g.first(offset=offset)
 
     assert_eq(expect, got)
 
@@ -1816,11 +1817,12 @@ def test_first(idx, offset):
     ],
 )
 @pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
-def test_last(idx, offset):
-    ps = pd.Series(range(len(idx)), index=idx)
-    gs = cudf.from_pandas(ps)
+@pytest.mark.parametrize("klass", ["Series", "DataFrame"])
+def test_last(idx, offset, klass):
+    p = getattr(pd, klass)(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
 
-    expect = ps.last(offset=offset)
-    got = gs.last(offset=offset)
+    expect = p.last(offset=offset)
+    got = g.last(offset=offset)
 
     assert_eq(expect, got)

From 0c978913e7c0e5e3e93b5dbb43930906c69c7f7f Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Tue, 16 Nov 2021 21:38:24 -0800
Subject: [PATCH 5/9] consolidating logics into helper

---
 python/cudf/cudf/core/indexed_frame.py | 79 +++++++++++++-------------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 07f7c4ed842..ca2313195a5 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3,8 +3,9 @@
 
 from __future__ import annotations
 
+import operator
 import warnings
-from typing import Type, TypeVar
+from typing import Callable, Type, TypeVar
 from uuid import uuid4
 
 import cupy as cp
@@ -13,11 +14,7 @@
 
 import cudf
 from cudf._typing import ColumnLike
-from cudf.api.types import (
-    is_categorical_dtype,
-    is_datetime_dtype,
-    is_list_like,
-)
+from cudf.api.types import is_categorical_dtype, is_list_like
 from cudf.core.column import arange
 from cudf.core.frame import Frame
 from cudf.core.index import Index
@@ -105,6 +102,7 @@ class IndexedFrame(Frame):
     # mypy can't handle bound type variables as class members
     _loc_indexer_type: Type[_LocIndexerClass]  # type: ignore
     _iloc_indexer_type: Type[_IlocIndexerClass]  # type: ignore
+    _index: cudf.core.index.BaseIndex
 
     def __init__(self, data=None, index=None):
         super().__init__(data=data, index=index)
@@ -763,6 +761,29 @@ def resample(
             else cudf.core.resample.DataFrameResampler(self, by=by)
         )
 
+    def _first_or_last(
+        self, offset, idx: int, op: Callable, side: str, slice_func: Callable
+    ) -> "IndexedFrame":
+        """Shared code path for ``first`` and ``last``."""
+        if not isinstance(self._index, cudf.core.index.DatetimeIndex):
+            raise TypeError("'first'/'last' only support a DatetimeIndex.")
+        if not isinstance(offset, str):
+            raise NotImplementedError(
+                f"Unsupported offset type {type(offset)}."
+            )
+
+        if len(self) == 0:
+            return self.copy()
+
+        pd_offset = cudf.DateOffset._from_str(
+            offset
+        )._maybe_as_fast_pandas_offset()
+        to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
+        end_point = int(
+            self._index._column.searchsorted(to_search, side=side)[0]
+        )
+        return slice_func(end_point)
+
+
     def first(self, offset):
         """Select initial periods of time series data based on a date offset.
 
@@ -801,24 +822,13 @@ def first(self, offset):
         2018-04-09  1
         2018-04-11  2
         """
-        if not is_datetime_dtype(self._index.dtype):
-            raise TypeError("'first' only supports a DatetimeIndex index.")
-        if not isinstance(offset, str):
-            raise NotImplementedError(
-                f"Unsupported offset type {type(offset)}."
-            )
-
-        if len(self) == 0:
-            return self.copy()
-
-        pd_offset = cudf.DateOffset._from_str(
-            offset
-        )._maybe_as_fast_pandas_offset()
-        to_search = pd.Timestamp(self._index._column[0]) + pd_offset
-        slice_end = int(
-            self._index._column.searchsorted(to_search, side="left")[0]
+        return self._first_or_last(
+            offset,
+            idx=0,
+            op=operator.__add__,
+            side="left",
+            slice_func=lambda i: self.iloc[:i],
         )
-        return self.iloc[:slice_end]
 
     def last(self, offset):
         """Select final periods of time series data based on a date offset.
@@ -858,21 +868,10 @@ def last(self, offset):
         2018-04-13  3
         2018-04-15  4
         """
-        if not is_datetime_dtype(self._index.dtype):
-            raise TypeError("'last' only supports a DatetimeIndex index.")
-        if not isinstance(offset, str):
-            raise NotImplementedError(
-                f"Unsupported offset type {type(offset)}."
-            )
-
-        if len(self) == 0:
-            return self.copy()
-
-        pd_offset = cudf.DateOffset._from_str(
-            offset
-        )._maybe_as_fast_pandas_offset()
-        to_search = pd.Timestamp(self._index._column[-1]) - pd_offset
-        slice_start = int(
-            self._index._column.searchsorted(to_search, side="right")[0]
+        return self._first_or_last(
+            offset,
+            idx=-1,
+            op=operator.__sub__,
+            side="right",
+            slice_func=lambda i: self.iloc[i:],
         )
-        return self.iloc[slice_start:]

From cbbc061ac944aa8ff53cd2c107aa12ff8dfc644a Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Wed, 17 Nov 2021 17:13:09 -0800
Subject: [PATCH 6/9] reject ambiguous input and add raise error message for
 unsupported types in date_range

---
 python/cudf/cudf/core/tools/datetimes.py | 41 ++++++++++++++++++++----
 python/cudf/cudf/tests/test_datetime.py  | 40 +++++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index d12cf0f4782..288463e9a9f 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -634,15 +634,35 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T:
         """
         Parse a string and return a DateOffset object.
 
-        Expects strings of the form 3D, 25W, 10ms, 42ns, etc.
-        See `_offset_alias_to_code` and `_CODE_TO_UNITS` for
-        supported list of strings.
+        A string can be a pandas `offset alias`<https://pandas.pydata.org/\
+            pandas-docs/stable/user_guide/timeseries.html#offset-aliases>_ or a
+        numpy `date/time unit code`<https://numpy.org/doc/stable/reference/arr\
+            ays.datetime.html#datetime-units>_
+
+        Note that ``m`` (lower case) is ambiguous and is not accepted in this
+        function. Use ``T``/``min`` for minutely frequency and ``M`` (upper
+        case) for monthly frequency.
+
+        Expects strings of the form 3D, 25W, -10ms, 42ns, etc.
+
+        Not all offset aliases are supported. See `_offset_alias_to_code` and
+        `_CODE_TO_UNITS` for supported list of strings.
         """
         match = cls._FREQSTR_REGEX.fullmatch(freqstr)
         if match is None:
             raise ValueError(f"Invalid frequency string: {freqstr}")
 
+        # Decompose the string into separate components
         sign_part, numeric_part, freq_part = match.groups()
+
+        # Handle various offset strings and normalize as codes
+        if freq_part == "m":
+            raise ValueError(
+                "Lower cased `m` is ambiguous. Use 'T'/'min' to specify "
+                "minutely frequency or upper cased `M` to specify monthly "
+                "frequency."
+            )
+
         if freq_part in _offset_alias_to_code:
             code = _offset_alias_to_code[freq_part]
         elif freq_part in cls._CODES_TO_UNITS:
@@ -650,9 +670,11 @@ def _from_str(cls: Type[_T], freqstr: str) -> _T:
         else:
             raise ValueError(f"Cannot interpret frequency str: {freqstr}")
 
+        # Handle sign and numerics
         sign = -1 if sign_part else 1
         n = int(numeric_part) if numeric_part else 1
 
+        # Construct the kwds dictionary
         return cls(**{cls._CODES_TO_UNITS[code]: n * sign})
 
     def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset:
@@ -820,11 +842,18 @@ def date_range(
     if isinstance(freq, DateOffset):
         offset = freq
     elif isinstance(freq, str):
-        offset = DateOffset._from_str(freq)
-        if "months" in offset.kwds or "years" in offset.kwds:
+        if (
+            any(
+                x in freq.upper()
+                for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"}
+            )
+            or "MS" in freq
+        ):
             raise ValueError(
-                f"Unrecognized or unsupported offset alias {freq}."
+                "date_range does not yet support month, quarter, year-anchored"
+                "or business-date frequency."
             )
+        offset = DateOffset._from_str(freq)
     else:
         raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")
 
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index d666dfc0ec1..ccebe85fc9c 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1583,6 +1583,46 @@ def test_date_range_raise_overflow():
         cudf.date_range(start=start, periods=periods, freq=freq)
 
 
+@pytest.mark.parametrize(
+    "freqstr_unsupported",
+    [
+        "1M",
+        "2SM",
+        "3MS",
+        "4BM",
+        "5CBM",
+        "6SMS",
+        "7BMS",
+        "8CBMS",
+        "Q",
+        "2BQ",
+        "3BQS",
+        "10A",
+        "10Y",
+        "9BA",
+        "9BY",
+        "8AS",
+        "8YS",
+        "7BAS",
+        "7BYS",
+        "BH",
+        "B",
+    ],
+)
+def test_date_range_raise_unsupported(freqstr_unsupported):
+    s, e = "2001-01-01", "2008-01-31"
+    pd.date_range(start=s, end=e, freq=freqstr_unsupported)
+    with pytest.raises(ValueError, match="does not yet support"):
+        cudf.date_range(start=s, end=e, freq=freqstr_unsupported)
+
+    # "3ms" would mean a millisecond frequency, not a month-start frequency
+    if not freqstr_unsupported == "3MS":
+        freqstr_unsupported = freqstr_unsupported.lower()
+        pd.date_range(start=s, end=e, freq=freqstr_unsupported)
+        with pytest.raises(ValueError, match="does not yet support"):
+            cudf.date_range(start=s, end=e, freq=freqstr_unsupported)
+
+
 ##################################################################
 #                    End of Date Range Test                      #
 ##################################################################

From 19f50eed6bda23fdb13c39e967f0686171395512 Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Wed, 17 Nov 2021 23:09:26 -0800
Subject: [PATCH 7/9] improvement over tests, more offset aliases

---
 python/cudf/cudf/tests/test_datetime.py | 50 +++++++++++++++++--------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 275f5497515..cad33c5cddf 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1828,16 +1828,44 @@ def test_ceil(data, time_type, resolution):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    "idx",
-    [
+@pytest.fixture(
+    params=[
         pd.DatetimeIndex([]),
         pd.DatetimeIndex(["2010-05-31"]),
         pd.date_range("2000-01-01", "2000-12-31", periods=21),
-    ],
+    ]
+)
+def idx(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        "10Y",
+        "6M",
+        "M",
+        "31D",
+        "0H",
+        "44640T",
+        "44640min",
+        "2678000S",
+        "2678000000L",
+        "2678000000ms",
+        "2678000000000U",
+        "2678000000000us",
+        "2678000000000000N",
+        "2678000000000000ns",
+    ]
 )
-@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
-@pytest.mark.parametrize("klass", ["Series", "DataFrame"])
+def offset(request):
+    return request.param
+
+
+@pytest.fixture(params=["Series", "DataFrame"])
+def klass(request):
+    return request.param
+
+
 def test_first(idx, offset, klass):
     p = getattr(pd, klass)(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)
@@ -1848,16 +1876,6 @@ def test_first(idx, offset, klass):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    "idx",
-    [
-        pd.DatetimeIndex([]),
-        pd.DatetimeIndex(["2010-05-31"]),
-        pd.date_range("2000-01-01", "2000-12-31", periods=21),
-    ],
-)
-@pytest.mark.parametrize("offset", ["0h", "1T", "6M", "10Y"])
-@pytest.mark.parametrize("klass", ["Series", "DataFrame"])
 def test_last(idx, offset, klass):
     p = getattr(pd, klass)(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)

From ba8f2df45d8b21ef48581a217dbd51f65c264929 Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Fri, 3 Dec 2021 19:18:22 -0800
Subject: [PATCH 8/9] resort to `to_offset` to convert freqstr and handle a corner case

---
 python/cudf/cudf/core/indexed_frame.py  | 13 +++-
 python/cudf/cudf/tests/test_datetime.py | 84 +++++++++++++++++++------
 2 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ca2313195a5..38b55028a8e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -775,10 +775,17 @@ def _first_or_last(
         if len(self) == 0:
             return self.copy()
 
-        pd_offset = cudf.DateOffset._from_str(
-            offset
-        )._maybe_as_fast_pandas_offset()
+        pd_offset = pd.tseries.frequencies.to_offset(offset)
         to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
+        if (
+            idx == 0
+            and not isinstance(pd_offset, pd.tseries.offsets.Tick)
+            and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
+        ):
+            # Special handling is required when the start time of the index
+            # is on the end of the offset. See pandas gh29623 for details.
+            to_search = to_search - pd_offset.base
+            return self.loc[:to_search]
         end_point = int(
             self._index._column.searchsorted(to_search, side=side)[0]
         )
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index a38c46c6ef2..2b95b49c3bd 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1861,19 +1861,17 @@ def test_floor(data, time_type, resolution):
     assert_eq(expect, got)
 
 
-@pytest.fixture(
-    params=[
+@pytest.mark.parametrize(
+    "idx",
+    [
         pd.DatetimeIndex([]),
         pd.DatetimeIndex(["2010-05-31"]),
         pd.date_range("2000-01-01", "2000-12-31", periods=21),
-    ]
+    ],
 )
-def idx(request):
-    return request.param
-
-
-@pytest.fixture(
-    params=[
+@pytest.mark.parametrize(
+    "offset",
+    [
         "10Y",
         "6M",
         "M",
@@ -1888,19 +1886,40 @@ def idx(request):
         "2678000000000us",
         "2678000000000000N",
         "2678000000000000ns",
-    ]
+    ],
 )
-def offset(request):
-    return request.param
+def test_first(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
 
+    expect = p.first(offset=offset)
+    got = g.first(offset=offset)
 
-@pytest.fixture(params=["Series", "DataFrame"])
-def klass(request):
-    return request.param
+    assert_eq(expect, got)
 
 
-def test_first(idx, offset, klass):
-    p = getattr(pd, klass)(range(len(idx)), index=idx)
+@pytest.mark.parametrize(
+    # This test case tests correctness when start is end of month
+    "idx, offset",
+    [
+        (
+            pd.DatetimeIndex(
+                [
+                    "2020-01-31",
+                    "2020-02-15",
+                    "2020-02-29",
+                    "2020-03-15",
+                    "2020-03-31",
+                    "2020-04-15",
+                    "2020-04-30",
+                ]
+            ),
+            "3M",
+        )
+    ],
+)
+def test_first_start_at_end_of_month(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)
 
     expect = p.first(offset=offset)
@@ -1909,8 +1928,35 @@ def test_first(idx, offset, klass):
     assert_eq(expect, got)
 
 
-def test_last(idx, offset, klass):
-    p = getattr(pd, klass)(range(len(idx)), index=idx)
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.DatetimeIndex([]),
+        pd.DatetimeIndex(["2010-05-31"]),
+        pd.date_range("2000-01-01", "2000-12-31", periods=21),
+    ],
+)
+@pytest.mark.parametrize(
+    "offset",
+    [
+        "10Y",
+        "6M",
+        "M",
+        "31D",
+        "0H",
+        "44640T",
+        "44640min",
+        "2678000S",
+        "2678000000L",
+        "2678000000ms",
+        "2678000000000U",
+        "2678000000000us",
+        "2678000000000000N",
+        "2678000000000000ns",
+    ],
+)
+def test_last(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)
 
     expect = p.last(offset=offset)

From 1076a7c6748dd86aa79219eb6e1330573b4211f8 Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Wed, 15 Dec 2021 16:57:24 -0800
Subject: [PATCH 9/9] revert changes introduced from closed PR #9709

---
 python/cudf/cudf/core/tools/datetimes.py | 52 ++++++------------------
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index a78cb5787bf..15426d0173a 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -39,8 +39,6 @@
     "min": "m",
     "s": "s",
     "S": "s",
-    "L": "ms",
-    "ms": "ms",
     "U": "us",
     "us": "us",
     "N": "ns",
@@ -451,6 +449,7 @@ class DateOffset:
         "ns": "nanoseconds",
         "us": "microseconds",
         "ms": "milliseconds",
+        "L": "milliseconds",
         "s": "seconds",
         "m": "minutes",
         "h": "hours",
@@ -642,52 +641,25 @@ def __repr__(self):
         return repr_str
 
     @classmethod
-    def _from_str(cls: Type[_T], freqstr: str) -> _T:
+    def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
         """
-        Parse a string and return a DateOffset object.
-
-        A string can be a pandas `offset alias`<https://pandas.pydata.org/\
-            pandas-docs/stable/user_guide/timeseries.html#offset-aliases>_ or a
-        numpy `date/time unit code`<https://numpy.org/doc/stable/reference/arr\
-            ays.datetime.html#datetime-units>_
-
-        Note that ``m`` (lower case) is ambiguous and is not accepted in this
-        function. Use ``T``/``min`` for minutely frequency and ``M`` (upper
-        case) for monthly frequency.
-
-        Expects strings of the form 3D, 25W, -10ms, 42ns, etc.
-
-        Not all offset aliases are supported. See `_offset_alias_to_code` and
-        `_CODE_TO_UNITS` for supported list of strings.
+        Parse a string and return a DateOffset object
+        expects strings of the form 3D, 25W, 10ms, 42ns, etc.
         """
-        match = cls._FREQSTR_REGEX.fullmatch(freqstr)
+        match = cls._FREQSTR_REGEX.match(freqstr)
+
         if match is None:
             raise ValueError(f"Invalid frequency string: {freqstr}")
 
-        # Decompose the string into separate components
-        sign_part, numeric_part, freq_part = match.groups()
+        numeric_part = match.group(1)
+        if numeric_part == "":
+            numeric_part = "1"
+        freq_part = match.group(2)
 
-        # Handle various offset strings and normalize as codes
-        if freq_part == "m":
-            raise ValueError(
-                "Lower cased `m` is ambiguous. Use 'T'/'min' to specify "
-                "minutely frequency or upper cased `M` to specify monthly "
-                "frequency."
-            )
-
-        if freq_part in _offset_alias_to_code:
-            code = _offset_alias_to_code[freq_part]
-        elif freq_part in cls._CODES_TO_UNITS:
-            code = freq_part
-        else:
+        if freq_part not in cls._CODES_TO_UNITS:
             raise ValueError(f"Cannot interpret frequency str: {freqstr}")
 
-        # Handle sign and numerics
-        sign = -1 if sign_part else 1
-        n = int(numeric_part) if numeric_part else 1
-
-        # Construct the kwds dictionary
-        return cls(**{cls._CODES_TO_UNITS[code]: n * sign})
+        return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})
 
     @classmethod
     def _from_pandas_ticks_or_weeks(