-
Notifications
You must be signed in to change notification settings - Fork 919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add first
and last
method to IndexedFrame
#9710
Changes from 10 commits
107c08a
25da96e
6cc207e
63a4f3b
0c97891
cbbc061
1fddc14
19f50ee
c34da36
ba8f2df
dc5e92f
254da75
1076a7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,9 @@ | |
|
||
from __future__ import annotations | ||
|
||
import operator | ||
import warnings | ||
from typing import Type, TypeVar | ||
from typing import Callable, Type, TypeVar | ||
from uuid import uuid4 | ||
|
||
import cupy as cp | ||
|
@@ -101,6 +102,7 @@ class IndexedFrame(Frame): | |
# mypy can't handle bound type variables as class members | ||
_loc_indexer_type: Type[_LocIndexerClass] # type: ignore | ||
_iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore | ||
_index: cudf.core.index.BaseIndex | ||
|
||
def __init__(self, data=None, index=None): | ||
super().__init__(data=data, index=index) | ||
|
@@ -758,3 +760,125 @@ def resample( | |
if isinstance(self, cudf.Series) | ||
else cudf.core.resample.DataFrameResampler(self, by=by) | ||
) | ||
|
||
def _first_or_last(
    self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
    """Shared code path for ``first`` and ``last``.

    Parameters
    ----------
    offset : str
        Offset alias (for instance ``'3D'`` or ``'1M'``); it is parsed
        with ``pd.tseries.frequencies.to_offset``.
    idx : int
        Position of the anchor timestamp in the index: ``0`` when called
        from ``first``, ``-1`` when called from ``last``.
    op : Callable
        Binary operator applied to the anchor timestamp and the parsed
        offset to obtain the cut-off date (``operator.__add__`` for
        ``first``, ``operator.__sub__`` for ``last``).
    side : str
        ``side`` argument forwarded to ``searchsorted`` on the index
        column.
    slice_func : Callable
        Maps the integer position returned by ``searchsorted`` to the
        subset that is returned to the caller.

    Returns
    -------
    IndexedFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``.
    NotImplementedError
        If ``offset`` is not a string.
    """
    if not isinstance(self._index, cudf.core.index.DatetimeIndex):
        # Report the public method that was actually invoked; ``idx == 0``
        # is the anchor used by ``first``, anything else comes from ``last``.
        caller = "first" if idx == 0 else "last"
        raise TypeError(f"'{caller}' only supports a DatetimeIndex index.")
    if not isinstance(offset, str):
        raise NotImplementedError(
            f"Unsupported offset type {type(offset)}."
        )

    if len(self) == 0:
        return self.copy()

    pd_offset = pd.tseries.frequencies.to_offset(offset)
    # ``to_search`` is the cut-off date: the anchor timestamp shifted by
    # the offset, forward for ``first`` and backward for ``last``.
    to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
    # Review note kept from the PR thread: ``to_offset('M')`` yields
    # ``<MonthEnd>`` and ``to_offset('MS')`` yields ``<MonthBegin>``.
    if (
        idx == 0
        and not isinstance(pd_offset, pd.tseries.offsets.Tick)
        and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
    ):
        # Special handle is required when the start time of the index
        # is on the end of the offset. See pandas gh29623 for detail.
        to_search = to_search - pd_offset.base
        return self.loc[:to_search]
    end_point = int(
        self._index._column.searchsorted(to_search, side=side)[0]
    )
    return slice_func(end_point)
|
||
def first(self, offset):
    """Select initial periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the first few rows based on a date offset.

    Parameters
    ----------
    offset : str
        The offset length of the data that will be selected. For instance,
        '1M' will display all rows having their index within the first
        month.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.first('3D')
                A
    2018-04-09  1
    2018-04-11  2
    """
    # Anchor on the first index entry (idx=0) and add the offset to get
    # the cut-off; rows before the cut-off position are kept.
    return self._first_or_last(
        offset,
        idx=0,
        op=operator.__add__,
        side="left",
        slice_func=lambda i: self.iloc[:i],
    )
|
||
def last(self, offset):
    """Select final periods of time series data based on a date offset.

    When having a DataFrame with **sorted** dates as index, this function
    can select the last few rows based on a date offset.

    Parameters
    ----------
    offset : str
        The offset length of the data that will be selected. For instance,
        '3D' will display all rows having their index within the last 3
        days.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.last('3D')
                A
    2018-04-13  3
    2018-04-15  4
    """
    # Anchor on the last index entry (idx=-1) and subtract the offset to
    # get the cut-off; rows from the cut-off position onward are kept.
    return self._first_or_last(
        offset,
        idx=-1,
        op=operator.__sub__,
        side="right",
        slice_func=lambda i: self.iloc[i:],
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does the `op` callable do? And what operation is being performed on this line — what does `to_search` represent? Just wondering.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If `first` is called, then `op` is `add`, which computes the cut-off date counting forward from the first date in the column; if `last` is called, `op` is `sub`, which computes the cut-off date counting back from the last date. `to_search` is the cut-off date. Feel free to request changes if the naming/logic isn't very readable to your taste.