Skip to content

Commit

Permalink
Adding feature Truncate to DataFrame and Series (#11435)
Browse files Browse the repository at this point in the history
This PR closes #9629 by adding a `truncate` method to DataFrame and Series. `truncate` drops the entries of a DataFrame or Series that fall before and after the given index values. If the index being truncated contains only datetime values, `before` and `after` may be specified as strings instead of Timestamps.

Authors:
  - https://github.com/VamsiTallam95
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #11435
  • Loading branch information
VamsiTallam95 authored Nov 4, 2022
1 parent 6e13139 commit 9df2eba
Show file tree
Hide file tree
Showing 4 changed files with 301 additions and 0 deletions.
200 changes: 200 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,6 +1045,206 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
zip(self._column_names, data_columns), self._index
)

@_cudf_nvtx_annotate
def truncate(self, before=None, after=None, axis=0, copy=True):
    """
    Truncate a Series or DataFrame before and after some index value.

    This is a useful shorthand for boolean indexing based on index
    values above or below certain thresholds.

    Parameters
    ----------
    before : date, str, int, optional
        Truncate all rows before this index value. ``None`` (the
        default) leaves the start of the axis unbounded.
    after : date, str, int, optional
        Truncate all rows after this index value. ``None`` (the
        default) leaves the end of the axis unbounded.
    axis : {0 or 'index', 1 or 'columns'}, optional
        Axis to truncate. Truncates the index (rows) by default.
    copy : bool, default True
        Return a copy of the truncated section. Only ``True`` is
        accepted.

    Returns
    -------
    The truncated Series or DataFrame.

    Raises
    ------
    ValueError
        If ``copy`` is False, if the axis being truncated is neither
        monotonically increasing nor monotonically decreasing, or if
        ``before`` is greater than ``after``.

    Notes
    -----
    If the index being truncated contains only datetime values,
    `before` and `after` may be specified as strings instead of
    Timestamps.

    .. pandas-compat::
        **DataFrame.truncate, Series.truncate**

        The ``copy`` parameter is only present for API compatibility, but
        ``copy=False`` is not supported. This method always generates a
        copy.

    Examples
    --------
    **Series**

    >>> import cudf
    >>> cs1 = cudf.Series([1, 2, 3, 4])
    >>> cs1
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> cs1.truncate(before=1, after=2)
    1    2
    2    3
    dtype: int64

    >>> import cudf
    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s'
    ... )
    >>> cs2 = cudf.Series(range(len(dates)), index=dates)
    >>> cs2
    2021-01-01 23:45:00     0
    2021-01-01 23:45:01     1
    2021-01-01 23:45:02     2
    2021-01-01 23:45:03     3
    2021-01-01 23:45:04     4
    2021-01-01 23:45:05     5
    2021-01-01 23:45:06     6
    2021-01-01 23:45:07     7
    2021-01-01 23:45:08     8
    2021-01-01 23:45:09     9
    2021-01-01 23:45:10    10
    2021-01-01 23:45:11    11
    2021-01-01 23:45:12    12
    2021-01-01 23:45:13    13
    2021-01-01 23:45:14    14
    2021-01-01 23:45:15    15
    2021-01-01 23:45:16    16
    2021-01-01 23:45:17    17
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
                          ...
    2021-01-01 23:45:56    56
    2021-01-01 23:45:57    57
    2021-01-01 23:45:58    58
    2021-01-01 23:45:59    59
    dtype: int64

    >>> cs2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"
    ... )
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
    2021-01-01 23:45:25    25
    2021-01-01 23:45:26    26
    2021-01-01 23:45:27    27
    dtype: int64

    >>> cs3 = cudf.Series({'A': 1, 'B': 2, 'C': 3, 'D': 4})
    >>> cs3
    A    1
    B    2
    C    3
    D    4
    dtype: int64

    >>> cs3.truncate(before='B', after='C')
    B    2
    C    3
    dtype: int64

    **DataFrame**

    >>> df = cudf.DataFrame({
    ...     'A': ['a', 'b', 'c', 'd', 'e'],
    ...     'B': ['f', 'g', 'h', 'i', 'j'],
    ...     'C': ['k', 'l', 'm', 'n', 'o']
    ... }, index=[1, 2, 3, 4, 5])
    >>> df
       A  B  C
    1  a  f  k
    2  b  g  l
    3  c  h  m
    4  d  i  n
    5  e  j  o

    >>> df.truncate(before=2, after=4)
       A  B  C
    2  b  g  l
    3  c  h  m
    4  d  i  n

    >>> df.truncate(before="A", after="B", axis="columns")
       A  B
    1  a  f
    2  b  g
    3  c  h
    4  d  i
    5  e  j

    >>> import cudf
    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s'
    ... )
    >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates)
    >>> df2.head()
                         A  B
    2021-01-01 23:45:00  1  2
    2021-01-01 23:45:01  1  2
    2021-01-01 23:45:02  1  2
    2021-01-01 23:45:03  1  2
    2021-01-01 23:45:04  1  2

    >>> df2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"
    ... )
                         A  B
    2021-01-01 23:45:18  1  2
    2021-01-01 23:45:19  1  2
    2021-01-01 23:45:20  1  2
    2021-01-01 23:45:21  1  2
    2021-01-01 23:45:22  1  2
    2021-01-01 23:45:23  1  2
    2021-01-01 23:45:24  1  2
    2021-01-01 23:45:25  1  2
    2021-01-01 23:45:26  1  2
    2021-01-01 23:45:27  1  2
    """
    if not copy:
        raise ValueError("Truncating with copy=False is not supported.")

    axis = self._get_axis_from_axis_arg(axis)
    # Row labels come from the frame's index; column labels are
    # materialized as a pandas index.
    ax = self._index if axis == 0 else self._data.to_pandas_index()

    if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
        raise ValueError("truncate requires a sorted index")

    # isinstance rather than an exact type comparison so that subclasses
    # of DatetimeIndex get the same string -> Timestamp conversion.
    if isinstance(ax, cudf.core.index.DatetimeIndex):
        before = pd.to_datetime(before)
        after = pd.to_datetime(after)

    if before is not None and after is not None and before > after:
        raise ValueError(f"Truncate: {after} must be after {before}")

    # On a descending axis the larger label comes first, so the slice
    # bounds are swapped. The nunique() guard skips the swap when every
    # label is identical (such an axis reports as monotonic both ways).
    if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
        before, after = after, before

    # Select everything on the other axes; label-slice only the target.
    slicer = [slice(None, None)] * self.ndim
    slicer[axis] = slice(before, after)
    return self.loc[tuple(slicer)].copy()

@cached_property
def loc(self):
"""Select rows and columns by label or boolean mask.
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,10 @@ def __setitem__(self, key, value):
self._frame.iloc[key] = value

def _loc_to_iloc(self, arg):
if isinstance(arg, tuple) and arg and isinstance(arg[0], slice):
if len(arg) > 1:
raise IndexError("Too many Indexers")
arg = arg[0]
if _is_scalar_or_zero_d_array(arg):
if not _is_non_decimal_numeric_dtype(self._frame.index.dtype):
# TODO: switch to cudf.utils.dtypes.is_integer(arg)
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,62 @@ def test_axes(data):
assert_eq(e, a)


def test_dataframe_truncate_axis_0():
    """Row truncation matches pandas for both spellings of axis 0."""
    columns = {
        "A": list("abcde"),
        "B": list("fghij"),
        "C": list("klmno"),
    }
    gdf = cudf.DataFrame(columns, index=[1, 2, 3, 4, 5])
    host_df = gdf.to_pandas()

    # Axis given by name.
    by_name = {"before": 2, "after": 4, "axis": "index"}
    assert_eq(gdf.truncate(**by_name), host_df.truncate(**by_name))

    # Axis given by number.
    by_number = {"before": 1, "after": 4, "axis": 0}
    assert_eq(host_df.truncate(**by_number), gdf.truncate(**by_number))


def test_dataframe_truncate_axis_1():
    """Column truncation matches pandas for both spellings of axis 1."""
    columns = {
        "A": list("abcde"),
        "B": list("fghij"),
        "C": list("klmno"),
    }
    gdf = cudf.DataFrame(columns, index=[1, 2, 3, 4, 5])
    host_df = gdf.to_pandas()

    # The same label window is checked with the axis named and numbered.
    for axis in ("columns", 1):
        want = host_df.truncate(before="A", after="B", axis=axis)
        got = gdf.truncate(before="A", after="B", axis=axis)
        assert_eq(got, want)


def test_dataframe_truncate_datetimeindex():
    """String bounds on a datetime-indexed DataFrame match pandas."""
    stamps = cudf.date_range(
        "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s"
    )
    gdf = cudf.DataFrame(data={"A": 1, "B": 2}, index=stamps)
    host_df = gdf.to_pandas()

    window = {
        "before": "2021-01-01 23:45:18",
        "after": "2021-01-01 23:45:27",
    }
    want = host_df.truncate(**window)
    got = gdf.truncate(**window)

    assert_eq(got, want)


def test_series_init_none():

# test for creating empty series
Expand Down
41 changes: 41 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,47 @@ def test_axes(data):
assert_eq(e, a)


def test_series_truncate():
    """Series.truncate matches pandas with no, positional and keyword bounds."""
    gsr = cudf.Series([1, 2, 3, 4])
    host_sr = gsr.to_pandas()

    call_variants = [
        ((), {}),
        ((1, 2), {}),
        ((), {"before": 1, "after": 2}),
    ]
    for args, kwargs in call_variants:
        assert_eq(
            gsr.truncate(*args, **kwargs),
            host_sr.truncate(*args, **kwargs),
        )


def test_series_truncate_errors():
    """Invalid arguments and an unsorted index are rejected."""
    gsr = cudf.Series([1, 2, 3, 4])

    # Each of these keyword arguments must be rejected with ValueError.
    for bad_kwargs in ({"axis": 1}, {"copy": False}):
        with pytest.raises(ValueError):
            gsr.truncate(**bad_kwargs)

    # An unsorted index must fail the same way it does in pandas.
    gsr.index = [3, 2, 1, 6]
    host_sr = gsr.to_pandas()
    assert_exceptions_equal(lfunc=gsr.truncate, rfunc=host_sr.truncate)


def test_series_truncate_datetimeindex():
    """String bounds on a datetime-indexed Series match pandas."""
    # NOTE(review): the range ends on 2021-01-02, so it spans a little over
    # a day at one-second frequency although only a ten-second window is
    # checked -- confirm the end date is intentional.
    stamps = cudf.date_range(
        "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s"
    )
    gsr = cudf.Series(range(len(stamps)), index=stamps)
    host_sr = gsr.to_pandas()

    window = {
        "before": "2021-01-01 23:45:18",
        "after": "2021-01-01 23:45:27",
    }
    assert_eq(gsr.truncate(**window), host_sr.truncate(**window))


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit 9df2eba

Please sign in to comment.