Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add first and last method to IndexedFrame #9710

Merged
merged 13 commits into from
Dec 24, 2021
Merged
126 changes: 125 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@

from __future__ import annotations

import operator
import warnings
from typing import Type, TypeVar
from typing import Callable, Type, TypeVar
from uuid import uuid4

import cupy as cp
Expand Down Expand Up @@ -101,6 +102,7 @@ class IndexedFrame(Frame):
# mypy can't handle bound type variables as class members
_loc_indexer_type: Type[_LocIndexerClass] # type: ignore
_iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore
_index: cudf.core.index.BaseIndex

def __init__(self, data=None, index=None):
super().__init__(data=data, index=index)
Expand Down Expand Up @@ -758,3 +760,125 @@ def resample(
if isinstance(self, cudf.Series)
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _first_or_last(
self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
"""Shared code path for ``first`` and ``last``."""
if not isinstance(self._index, cudf.core.index.DatetimeIndex):
raise TypeError("'first' only supports a DatetimeIndex index.")
if not isinstance(offset, str):
raise NotImplementedError(
f"Unsupported offset type {type(offset)}."
)

if len(self) == 0:
return self.copy()

pd_offset = pd.tseries.frequencies.to_offset(offset)
to_search = op(pd.Timestamp(self._index._column[idx]), pd_offset)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the `op` callable do, and what operation is being performed on this line? What does `to_search` represent? Just wondering.

Copy link
Contributor Author

@isVoid isVoid Dec 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If `first` is called, then `op` is `add`, which computes the cut-off date counting forward from the first date in the column; if `last` is called, `op` is `sub`, which computes the cut-off date counting backward from the last date. `to_search` is the cut-off date. Feel free to request changes if the naming/logic isn't readable enough for your taste.

if (
idx == 0
and not isinstance(pd_offset, pd.tseries.offsets.Tick)
and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does `is_on_offset` check for datetime values only at the end of the offset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a pandas DateOffset can be either MonthBegin or MonthEnd, depending on whether the string is `MS` or `M`.

In [3]: pd.tseries.frequencies.to_offset('M')
Out[3]: <MonthEnd>

In [4]: pd.tseries.frequencies.to_offset('MS')
Out[4]: <MonthBegin>

I believe is_on_offset checks if the given datetime falls on the given offset. There isn't much documentation about this but it seems like every kind of offset has this function.

Like Nano would always return true:
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L834-L835

and BusinessDay would check whether the weekday of the given date falls between Monday and Friday
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L1445-L1448

and for MonthEnd I believe it resorts to the base method:
https://github.com/pandas-dev/pandas/blob/878a0225c648cb145949f78085a8ff3f902a1c20/pandas/_libs/tslibs/offsets.pyx#L650-L659
Note that `+` and `-` on a MonthEnd basically set the date to the end of that month; the same applies to other offsets.

):
# Special handling is required when the start time of the index
# falls on the end of the offset. See pandas gh29623 for details.
to_search = to_search - pd_offset.base
return self.loc[:to_search]
end_point = int(
self._index._column.searchsorted(to_search, side=side)[0]
)
return slice_func(end_point)

def first(self, offset):
    """Return the leading rows that fall within ``offset`` of the first date.

    The index is expected to hold **sorted** dates. Rows are kept while
    their index lies inside the window that starts at the first index
    value and spans ``offset``.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '1M' will display all rows having their index within the first
        month.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``
    NotImplementedError
        If ``offset`` is not a string.

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.first('3D')
                A
    2018-04-09  1
    2018-04-11  2
    """

    def _take_head(stop):
        # Keep every row positioned before the cut-off location.
        return self.iloc[:stop]

    # The cut-off date is computed forward (addition) from the first index
    # value, i.e. the element at position 0.
    return self._first_or_last(
        offset,
        idx=0,
        op=operator.add,
        side="left",
        slice_func=_take_head,
    )

def last(self, offset):
    """Return the trailing rows that fall within ``offset`` of the last date.

    The index is expected to hold **sorted** dates. Rows are kept while
    their index lies inside the window that ends at the last index value
    and spans ``offset``.

    Parameters
    ----------
    offset: str
        The offset length of the data that will be selected. For instance,
        '3D' will display all rows having their index within the last 3
        days.

    Returns
    -------
    Series or DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a ``DatetimeIndex``
    NotImplementedError
        If ``offset`` is not a string.

    Examples
    --------
    >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D')
    >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i)
    >>> ts
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4
    >>> ts.last('3D')
                A
    2018-04-13  3
    2018-04-15  4
    """

    def _take_tail(start):
        # Keep every row positioned at or after the cut-off location.
        return self.iloc[start:]

    # The cut-off date is computed backward (subtraction) from the last
    # index value, i.e. the element at position -1.
    return self._first_or_last(
        offset,
        idx=-1,
        op=operator.sub,
        side="right",
        slice_func=_take_tail,
    )
81 changes: 52 additions & 29 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
"min": "m",
"s": "s",
"S": "s",
"L": "ms",
"ms": "ms",
"U": "us",
"us": "us",
"N": "ns",
Expand Down Expand Up @@ -448,7 +450,6 @@ class DateOffset:
"ns": "nanoseconds",
"us": "microseconds",
"ms": "milliseconds",
"L": "milliseconds",
"s": "seconds",
"m": "minutes",
"h": "hours",
Expand All @@ -458,7 +459,7 @@ class DateOffset:
"Y": "years",
}

_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)")
_FREQSTR_REGEX = re.compile("(-)*([0-9]*)([a-zA-Z]+)")

def __init__(self, n=1, normalize=False, **kwds):
if normalize:
Expand Down Expand Up @@ -629,27 +630,54 @@ def __repr__(self):
return repr_str

@classmethod
def _from_freqstr(cls: Type[_T], freqstr: str) -> _T:
def _from_str(cls: Type[_T], freqstr: str) -> _T:
"""
Parse a string and return a DateOffset object
expects strings of the form 3D, 25W, 10ms, 42ns, etc.
"""
match = cls._FREQSTR_REGEX.match(freqstr)
Parse a string and return a DateOffset object.

A string can be a pandas `offset alias <https://pandas.pydata.org/\
pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ or a
numpy `date/time unit code <https://numpy.org/doc/stable/reference/arr\
ays.datetime.html#datetime-units>`_

Note that ``m`` (lower case) is ambiguous and is not accepted in this
function. Use ``T``/``min`` for minutely frequency and ``M`` (upper
case) for monthly frequency.

Expects strings of the form 3D, 25W, -10ms, 42ns, etc.

Not all offset aliases are supported. See `_offset_alias_to_code` and
`_CODES_TO_UNITS` for the list of supported strings.
"""
match = cls._FREQSTR_REGEX.fullmatch(freqstr)
if match is None:
raise ValueError(f"Invalid frequency string: {freqstr}")

numeric_part = match.group(1)
if numeric_part == "":
numeric_part = "1"
freq_part = match.group(2)
# Decompose the string into separate components
sign_part, numeric_part, freq_part = match.groups()

# Handle various offset strings and normalize as codes
if freq_part == "m":
raise ValueError(
"Lower cased `m` is ambiguous. Use 'T'/'min' to specify "
"minutely frequency or upper cased `M` to specify monthly "
"frequency."
)

if freq_part not in cls._CODES_TO_UNITS:
if freq_part in _offset_alias_to_code:
code = _offset_alias_to_code[freq_part]
elif freq_part in cls._CODES_TO_UNITS:
code = freq_part
else:
raise ValueError(f"Cannot interpret frequency str: {freqstr}")

return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)})
# Handle sign and numerics
sign = -1 if sign_part else 1
n = int(numeric_part) if numeric_part else 1

# Construct the kwds dictionary
return cls(**{cls._CODES_TO_UNITS[code]: n * sign})

def _maybe_as_fast_pandas_offset(self):
def _maybe_as_fast_pandas_offset(self) -> pd.DateOffset:
if (
len(self.kwds) == 1
and _has_fixed_frequency(self)
Expand Down Expand Up @@ -814,23 +842,18 @@ def date_range(
if isinstance(freq, DateOffset):
offset = freq
elif isinstance(freq, str):
# Map pandas `offset alias` into cudf DateOffset `CODE`, only
# fixed-frequency, non-anchored offset aliases are supported.
mo = re.fullmatch(
rf'(-)*(\d*)({"|".join(_offset_alias_to_code.keys())})', freq
)
if mo is None:
if (
any(
x in freq.upper()
for x in {"Y", "A", "Q", "B", "SM", "SMS", "CBMS", "M"}
)
or "MS" in freq
):
raise ValueError(
f"Unrecognized or unsupported offset alias {freq}."
"date_range does not yet support month, quarter, year-anchored"
"or business-date frequency."
)

sign, n, offset_alias = mo.groups()
code = _offset_alias_to_code[offset_alias]

freq = "".join([n, code])
offset = DateOffset._from_freqstr(freq)
if sign:
offset.kwds.update({s: -i for s, i in offset.kwds.items()})
offset = DateOffset._from_str(freq)
else:
raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")

Expand Down
Loading