diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 51bfad3a054..4be35d960ee 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3,9 +3,10 @@
 
 from __future__ import annotations
 
+import operator
 import warnings
 from collections import abc
-from typing import Type, TypeVar
+from typing import Callable, Type, TypeVar
 from uuid import uuid4
 
 import cupy as cp
@@ -109,6 +110,7 @@ class IndexedFrame(Frame):
     # mypy can't handle bound type variables as class members
     _loc_indexer_type: Type[_LocIndexerClass]  # type: ignore
     _iloc_indexer_type: Type[_IlocIndexerClass]  # type: ignore
+    _index: cudf.core.index.BaseIndex
 
     def __init__(self, data=None, index=None):
         super().__init__(data=data, index=index)
@@ -1104,3 +1106,148 @@ def resample(
             if isinstance(self, cudf.Series)
             else cudf.core.resample.DataFrameResampler(self, by=by)
         )
+
+    def _first_or_last(
+        self, offset, idx: int, op: Callable, side: str, slice_func: Callable
+    ) -> "IndexedFrame":
+        """Shared code path for ``first`` and ``last``.
+
+        Parameters
+        ----------
+        offset : str
+            Offset alias parsed with ``pd.tseries.frequencies.to_offset``.
+        idx : int
+            Position of the anchoring index entry: ``0`` for ``first``,
+            ``-1`` for ``last``.
+        op : Callable
+            Binary operator combining the anchoring timestamp with the
+            offset (addition for ``first``, subtraction for ``last``).
+        side : str
+            ``side`` argument forwarded to ``searchsorted``.
+        slice_func : Callable
+            Maps the integer position found by ``searchsorted`` to the
+            rows that are returned.
+        """
+        if not isinstance(self._index, cudf.core.index.DatetimeIndex):
+            # This helper is shared, so report the public method that was
+            # actually called instead of hard-coding 'first'.
+            method = "first" if idx == 0 else "last"
+            raise TypeError(
+                f"'{method}' only supports a DatetimeIndex index."
+            )
+        if not isinstance(offset, str):
+            raise NotImplementedError(
+                f"Unsupported offset type {type(offset)}."
+            )
+
+        if len(self) == 0:
+            return self.copy()
+
+        pd_offset = pd.tseries.frequencies.to_offset(offset)
+        anchor = pd.Timestamp(self._index._column[idx])
+        to_search = op(anchor, pd_offset)
+        if (
+            idx == 0
+            and not isinstance(pd_offset, pd.tseries.offsets.Tick)
+            and pd_offset.is_on_offset(anchor)
+        ):
+            # Special handling is required when the start time of the index
+            # is on the end of the offset. See pandas gh29623 for detail.
+            to_search = to_search - pd_offset.base
+            return self.loc[:to_search]
+        end_point = int(
+            self._index._column.searchsorted(to_search, side=side)[0]
+        )
+        return slice_func(end_point)
+
+    def first(self, offset):
+        """Select initial periods of time series data based on a date offset.
+
+        When having a DataFrame with **sorted** dates as index, this function
+        can select the first few rows based on a date offset.
+
+        Parameters
+        ----------
+        offset : str
+            The offset length of the data that will be selected. For instance,
+            '1M' will display all rows having their index within the first
+            month.
+
+        Returns
+        -------
+        Series or DataFrame
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a ``DatetimeIndex``
+
+        Examples
+        --------
+        >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
+        >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+        >>> ts
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+        >>> ts.first('3D')
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        """
+        return self._first_or_last(
+            offset,
+            idx=0,
+            op=operator.__add__,
+            side="left",
+            slice_func=lambda i: self.iloc[:i],
+        )
+
+    def last(self, offset):
+        """Select final periods of time series data based on a date offset.
+
+        When having a DataFrame with **sorted** dates as index, this function
+        can select the last few rows based on a date offset.
+
+        Parameters
+        ----------
+        offset : str
+            The offset length of the data that will be selected. For instance,
+            '3D' will display all rows having their index within the last 3
+            days.
+
+        Returns
+        -------
+        Series or DataFrame
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a ``DatetimeIndex``
+
+        Examples
+        --------
+        >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
+        >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+        >>> ts
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+        >>> ts.last('3D')
+                    A
+        2018-04-13  3
+        2018-04-15  4
+        """
+        return self._first_or_last(
+            offset,
+            idx=-1,
+            op=operator.__sub__,
+            side="right",
+            slice_func=lambda i: self.iloc[i:],
+        )
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 1a1b21aa3d5..9d120819248 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1892,3 +1892,114 @@ def test_round(data, time_type, resolution):
     expect = ps.dt.round(resolution)
     got = gs.dt.round(resolution)
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.DatetimeIndex([]),
+        pd.DatetimeIndex(["2010-05-31"]),
+        pd.date_range("2000-01-01", "2000-12-31", periods=21),
+    ],
+)
+@pytest.mark.parametrize(
+    "offset",
+    [
+        "10Y",
+        "6M",
+        "M",
+        "31D",
+        "0H",
+        "44640T",
+        "44640min",
+        "2678000S",
+        "2678000000L",
+        "2678000000ms",
+        "2678000000000U",
+        "2678000000000us",
+        "2678000000000000N",
+        "2678000000000000ns",
+    ],
+)
+def test_first(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
+
+    expect = p.first(offset=offset)
+    got = g.first(offset=offset)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    # This test case tests correctness when start is end of month
+    "idx, offset",
+    [
+        (
+            pd.DatetimeIndex(
+                [
+                    "2020-01-31",
+                    "2020-02-15",
+                    "2020-02-29",
+                    "2020-03-15",
+                    "2020-03-31",
+                    "2020-04-15",
+                    "2020-04-30",
+                ]
+            ),
+            "3M",
+        )
+    ],
+)
+def test_first_start_at_end_of_month(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
+
+    expect = p.first(offset=offset)
+    got = g.first(offset=offset)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.DatetimeIndex([]),
+        pd.DatetimeIndex(["2010-05-31"]),
+        pd.date_range("2000-01-01", "2000-12-31", periods=21),
+    ],
+)
+@pytest.mark.parametrize(
+    "offset",
+    [
+        "10Y",
+        "6M",
+        "M",
+        "31D",
+        "0H",
+        "44640T",
+        "44640min",
+        "2678000S",
+        "2678000000L",
+        "2678000000ms",
+        "2678000000000U",
+        "2678000000000us",
+        "2678000000000000N",
+        "2678000000000000ns",
+    ],
+)
+def test_last(idx, offset):
+    p = pd.Series(range(len(idx)), index=idx)
+    g = cudf.from_pandas(p)
+
+    expect = p.last(offset=offset)
+    got = g.last(offset=offset)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("method", ["first", "last"])
+def test_first_last_requires_datetime_index(method):
+    g = cudf.Series([1, 2, 3])
+    with pytest.raises(TypeError, match=f"'{method}' only supports"):
+        getattr(g, method)("1D")