Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add first and last method to IndexedFrame #9710

Merged
merged 13 commits into from
Dec 24, 2021
Merged
126 changes: 125 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@

from __future__ import annotations

import operator
import warnings
from collections import abc
from typing import Type, TypeVar
from typing import Callable, Type, TypeVar
from uuid import uuid4

import cupy as cp
Expand Down Expand Up @@ -109,6 +110,7 @@ class IndexedFrame(Frame):
# mypy can't handle bound type variables as class members
_loc_indexer_type: Type[_LocIndexerClass] # type: ignore
_iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore
_index: cudf.core.index.BaseIndex

def __init__(self, data=None, index=None):
super().__init__(data=data, index=index)
Expand Down Expand Up @@ -1104,3 +1106,125 @@ def resample(
if isinstance(self, cudf.Series)
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _first_or_last(
self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
"""Shared code path for ``first`` and ``last``."""
if not isinstance(self._index, cudf.core.index.DatetimeIndex):
raise TypeError("'first' only supports a DatetimeIndex index.")
if not isinstance(offset, str):
raise NotImplementedError(
f"Unsupported offset type {type(offset)}."
)

if len(self) == 0:
return self.copy()

pd_offset = pd.tseries.frequencies.to_offset(offset)
to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the `op` callable do? And what operation is being performed on this line — what does `to_search` represent? (Just wondering.)

Copy link
Contributor Author

@isVoid isVoid Dec 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If `first` is called, then `op` is `add`, which computes the cut-off date counting forward from the first date in the column; if `last` is called, `op` is `sub`, which computes the cut-off date counting back from the last date. `to_search` is the cut-off date. Feel free to request changes if the naming/logic isn't readable enough for your taste.

if (
idx == 0
and not isinstance(pd_offset, pd.tseries.offsets.Tick)
and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does is_on_offset check for datetimes values only at the end of the offset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a pandas DateOffset can be MonthBegin or MonthEnd, depending on whether the string is MS or M.

In [3]: pd.tseries.frequencies.to_offset('M')
Out[3]: <MonthEnd>

In [4]: pd.tseries.frequencies.to_offset('MS')
Out[4]: <MonthBegin>

I believe is_on_offset checks if the given datetime falls on the given offset. There isn't much documentation about this but it seems like every kind of offset has this function.

Like Nano would always return true:
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L834-L835

and BusinessDay would check if the weekday of the given date is monday to friday
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L1445-L1448

and for MonthEnd I believe it resorts to the base method:
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L650-L659
Note that `+` and `-` on a MonthEnd basically set the date to the end of that month; the same applies to other offsets.

):
# Special handle is required when the start time of the index
# is on the end of the offset. See pandas gh29623 for detail.
to_search = to_search - pd_offset.base
return self.loc[:to_search]
end_point = int(
self._index._column.searchsorted(to_search, side=side)[0]
)
return slice_func(end_point)

def first(self, offset):
    """Select initial periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the first few rows based on a date offset.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '1M' will display all rows having their index within the first
        month.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``
    NotImplementedError
        If ``offset`` is not a string.

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.first('3D')
                A
    2018-04-09  1
    2018-04-11  2
    """
    # The shared helper derives the cut-off as ``index[0] + offset`` and
    # hands the located position to ``slice_func``; slicing ``[:i]``
    # keeps the leading rows up to that position.
    return self._first_or_last(
        offset,
        idx=0,
        op=operator.add,
        side="left",
        slice_func=lambda i: self.iloc[:i],
    )

def last(self, offset):
    """Select final periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the last few rows based on a date offset.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '3D' will display all rows having their index within the last 3
        days.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``
    NotImplementedError
        If ``offset`` is not a string.

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.last('3D')
                A
    2018-04-13  3
    2018-04-15  4
    """
    # The shared helper derives the cut-off as ``index[-1] - offset`` and
    # hands the located position to ``slice_func``; slicing ``[i:]``
    # keeps the trailing rows from that position onward.
    return self._first_or_last(
        offset,
        idx=-1,
        op=operator.sub,
        side="right",
        slice_func=lambda i: self.iloc[i:],
    )
104 changes: 104 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,3 +1892,107 @@ def test_round(data, time_type, resolution):
expect = ps.dt.round(resolution)
got = gs.dt.round(resolution)
assert_eq(expect, got)


@pytest.mark.parametrize(
    "idx",
    [
        pd.DatetimeIndex([]),
        pd.DatetimeIndex(["2010-05-31"]),
        pd.date_range("2000-01-01", "2000-12-31", periods=21),
    ],
)
@pytest.mark.parametrize(
    "offset",
    [
        "10Y",
        "6M",
        "M",
        "31D",
        "0H",
        "44640T",
        "44640min",
        "2678000S",
        "2678000000L",
        "2678000000ms",
        "2678000000000U",
        "2678000000000us",
        "2678000000000000N",
        "2678000000000000ns",
    ],
)
def test_first(idx, offset):
    """Check that ``Series.first`` gives the same rows in cudf and pandas."""
    # Build the pandas reference and mirror it onto the GPU.
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.first(offset=offset)
    actual = cudf_series.first(offset=offset)

    assert_eq(expected, actual)


# This test case tests correctness when start is end of month
@pytest.mark.parametrize(
    "idx, offset",
    [
        (
            pd.DatetimeIndex(
                [
                    "2020-01-31",
                    "2020-02-15",
                    "2020-02-29",
                    "2020-03-15",
                    "2020-03-31",
                    "2020-04-15",
                    "2020-04-30",
                ]
            ),
            "3M",
        )
    ],
)
def test_first_start_at_end_of_month(idx, offset):
    """Compare ``first`` with pandas when the index starts on a month end."""
    # The first timestamp (2020-01-31) falls on the month-end offset.
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.first(offset=offset)
    actual = cudf_series.first(offset=offset)

    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "idx",
    [
        pd.DatetimeIndex([]),
        pd.DatetimeIndex(["2010-05-31"]),
        pd.date_range("2000-01-01", "2000-12-31", periods=21),
    ],
)
@pytest.mark.parametrize(
    "offset",
    [
        "10Y",
        "6M",
        "M",
        "31D",
        "0H",
        "44640T",
        "44640min",
        "2678000S",
        "2678000000L",
        "2678000000ms",
        "2678000000000U",
        "2678000000000us",
        "2678000000000000N",
        "2678000000000000ns",
    ],
)
def test_last(idx, offset):
    """Check that ``Series.last`` gives the same rows in cudf and pandas."""
    # Build the pandas reference and mirror it onto the GPU.
    pandas_series = pd.Series(range(len(idx)), index=idx)
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.last(offset=offset)
    actual = cudf_series.last(offset=offset)

    assert_eq(expected, actual)