Skip to content

Commit

Permalink
Add first and last method to IndexedFrame (#9710)
Browse files Browse the repository at this point in the history
closes #9600 

This PR adds `first` and `last` methods to `IndexedFrame`. These methods only apply to an `IndexedFrame` with a `DatetimeIndex`, and they gather the first or last rows that fall within the time range specified by the `offset` argument.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Sheilah Kirui (https://github.com/skirui-source)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9710
  • Loading branch information
isVoid authored Dec 24, 2021
1 parent c99a37f commit e432d01
Show file tree
Hide file tree
Showing 2 changed files with 229 additions and 1 deletion.
126 changes: 125 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@

from __future__ import annotations

import operator
import warnings
from collections import abc
from typing import Type, TypeVar
from typing import Callable, Type, TypeVar
from uuid import uuid4

import cupy as cp
Expand Down Expand Up @@ -109,6 +110,7 @@ class IndexedFrame(Frame):
# mypy can't handle bound type variables as class members
_loc_indexer_type: Type[_LocIndexerClass] # type: ignore
_iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore
_index: cudf.core.index.BaseIndex

def __init__(self, data=None, index=None):
super().__init__(data=data, index=index)
Expand Down Expand Up @@ -1104,3 +1106,125 @@ def resample(
if isinstance(self, cudf.Series)
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _first_or_last(
    self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
    """Shared code path for ``first`` and ``last``.

    Parameters
    ----------
    offset : str
        Offset string; it is converted with
        ``pd.tseries.frequencies.to_offset``.
    idx : int
        Position of the anchor row in the index. ``first`` passes ``0``
        and ``last`` passes ``-1``.
    op : Callable
        Binary operator applied to the anchor timestamp and the offset
        to obtain the boundary timestamp.
    side : str
        Forwarded as the ``side`` argument of ``searchsorted`` on the
        index column.
    slice_func : Callable
        Called with the integer position returned by ``searchsorted``;
        its result is returned to the caller.

    Returns
    -------
    IndexedFrame
        A copy of ``self`` when it is empty, otherwise the selected rows.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``.
    NotImplementedError
        If ``offset`` is not a string.
    """
    if not isinstance(self._index, cudf.core.index.DatetimeIndex):
        # Report the public method that was actually called: ``first``
        # anchors on position 0, ``last`` on position -1.
        method_name = "first" if idx == 0 else "last"
        raise TypeError(
            f"'{method_name}' only supports a DatetimeIndex index."
        )
    if not isinstance(offset, str):
        raise NotImplementedError(
            f"Unsupported offset type {type(offset)}."
        )

    if len(self) == 0:
        return self.copy()

    pd_offset = pd.tseries.frequencies.to_offset(offset)
    to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
    if (
        idx == 0
        and not isinstance(pd_offset, pd.tseries.offsets.Tick)
        and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
    ):
        # Special handling is required when the start time of the index
        # is on the end of the offset. See pandas gh29623 for detail.
        to_search = to_search - pd_offset.base
        return self.loc[:to_search]
    end_point = int(
        self._index._column.searchsorted(to_search, side=side)[0]
    )
    return slice_func(end_point)

def first(self, offset):
    """Select initial periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the first few rows based on a date offset.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '1M' will display all rows having their index within the first
        month.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.first('3D')
                A
    2018-04-09  1
    2018-04-11  2
    """

    def take_leading_rows(stop):
        # Keep every row positioned before ``stop``.
        return self.iloc[:stop]

    # Anchor on the first index entry and move forward by the offset.
    return self._first_or_last(
        offset,
        idx=0,
        op=operator.add,
        side="left",
        slice_func=take_leading_rows,
    )

def last(self, offset):
    """Select final periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the last few rows based on a date offset.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '3D' will display all rows having their index within the last 3
        days.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.last('3D')
                A
    2018-04-13  3
    2018-04-15  4
    """

    def take_trailing_rows(start):
        # Keep every row positioned at or after ``start``.
        return self.iloc[start:]

    # Anchor on the last index entry and move backward by the offset.
    return self._first_or_last(
        offset,
        idx=-1,
        op=operator.sub,
        side="right",
        slice_func=take_trailing_rows,
    )
104 changes: 104 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,3 +1892,107 @@ def test_round(data, time_type, resolution):
expect = ps.dt.round(resolution)
got = gs.dt.round(resolution)
assert_eq(expect, got)


@pytest.mark.parametrize(
    "idx",
    [
        pd.DatetimeIndex([]),
        pd.DatetimeIndex(["2010-05-31"]),
        pd.date_range("2000-01-01", "2000-12-31", periods=21),
    ],
)
@pytest.mark.parametrize(
    "offset",
    [
        "10Y",
        "6M",
        "M",
        "31D",
        "0H",
        "44640T",
        "44640min",
        "2678000S",
        "2678000000L",
        "2678000000ms",
        "2678000000000U",
        "2678000000000us",
        "2678000000000000N",
        "2678000000000000ns",
    ],
)
def test_first(idx, offset):
    """``Series.first`` in cudf must agree with pandas for each offset."""
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    # pandas is evaluated first and serves as the reference result.
    assert_eq(
        pandas_series.first(offset=offset),
        cudf_series.first(offset=offset),
    )


@pytest.mark.parametrize(
    "idx, offset",
    [
        (
            pd.DatetimeIndex(
                [
                    "2020-01-31",
                    "2020-02-15",
                    "2020-02-29",
                    "2020-03-15",
                    "2020-03-31",
                    "2020-04-15",
                    "2020-04-30",
                ]
            ),
            "3M",
        )
    ],
)
def test_first_start_at_end_of_month(idx, offset):
    """Check ``first`` for correctness when the start is the end of month."""
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    # pandas is evaluated first and serves as the reference result.
    assert_eq(
        pandas_series.first(offset=offset),
        cudf_series.first(offset=offset),
    )


@pytest.mark.parametrize(
    "idx",
    [
        pd.DatetimeIndex([]),
        pd.DatetimeIndex(["2010-05-31"]),
        pd.date_range("2000-01-01", "2000-12-31", periods=21),
    ],
)
@pytest.mark.parametrize(
    "offset",
    [
        "10Y",
        "6M",
        "M",
        "31D",
        "0H",
        "44640T",
        "44640min",
        "2678000S",
        "2678000000L",
        "2678000000ms",
        "2678000000000U",
        "2678000000000us",
        "2678000000000000N",
        "2678000000000000ns",
    ],
)
def test_last(idx, offset):
    """``Series.last`` in cudf must agree with pandas for each offset."""
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    # pandas is evaluated first and serves as the reference result.
    assert_eq(
        pandas_series.last(offset=offset),
        cudf_series.last(offset=offset),
    )

0 comments on commit e432d01

Please sign in to comment.