Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding feature Truncate to DataFrame and Series #11435

Merged
merged 23 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9ab9e90
Added Truncate Functionality and test cases
VamsiTallam95 Aug 1, 2022
1c5082c
style and docstring fix
VamsiTallam95 Aug 1, 2022
0b28765
Added truncate functionality to dataframe
VamsiTallam95 Aug 2, 2022
4194427
cleaner version
VamsiTallam95 Aug 2, 2022
2fd181d
Update python/cudf/cudf/tests/test_dataframe.py
VamsiTallam95 Aug 17, 2022
b69f306
Update python/cudf/cudf/core/dataframe.py
VamsiTallam95 Aug 17, 2022
2d6b20e
Update python/cudf/cudf/tests/test_dataframe.py
VamsiTallam95 Aug 17, 2022
2d2271e
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
2734bd2
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
ddede29
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
31f6696
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
42fb830
incorporating feedback
VamsiTallam95 Aug 17, 2022
033f27e
fixing failed tests
VamsiTallam95 Aug 17, 2022
3f740bc
incorporating feedback
VamsiTallam95 Aug 18, 2022
42b4c7f
fixing style
VamsiTallam95 Aug 18, 2022
900110f
fixing doctests
VamsiTallam95 Aug 18, 2022
7ce214c
Merge remote-tracking branch 'origin/branch-22.12' into truncate
vyasr Nov 2, 2022
9a24b15
Fix Series indexing bug when given a length 1 list of slices.
vyasr Nov 4, 2022
530b54b
Move method into IndexedFrame.py.
vyasr Nov 4, 2022
e6446e8
Document that copy=False is not supported.
vyasr Nov 4, 2022
2e6ba3b
Address PR comments.
vyasr Nov 4, 2022
515fa22
Remove conditional on copy.
vyasr Nov 4, 2022
b3bea25
Shrink test.
vyasr Nov 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,6 +1045,207 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
zip(self._column_names, data_columns), self._index
)

@_cudf_nvtx_annotate
def truncate(self, before=None, after=None, axis=0, copy=True):
    """
    Truncate a Series or DataFrame before and after some index value.

    This is a useful shorthand for boolean indexing based on index
    values above or below certain thresholds.

    Parameters
    ----------
    before : date, str, int
        Truncate all rows before this index value.
    after : date, str, int
        Truncate all rows after this index value.
    axis : {0 or 'index', 1 or 'columns'}, optional
        Axis to truncate. Truncates the index (rows) by default.
    copy : bool, default True
        Return a copy of the truncated section. Only ``True`` is
        accepted; see the compatibility note below.

    Returns
    -------
    The truncated Series or DataFrame.

    Raises
    ------
    ValueError
        If ``copy`` is falsy, if the axis being truncated is neither
        monotonically increasing nor monotonically decreasing, or if
        ``before`` is greater than ``after``.

    Notes
    -----
    If the index being truncated contains only datetime values,
    `before` and `after` may be specified as strings instead of
    Timestamps.

    .. pandas-compat::
        **DataFrame.truncate, Series.truncate**

        The ``copy`` parameter is only present for API compatibility, but
        ``copy=False`` is not actually supported. This method always
        generates a copy.

    Examples
    --------
    **Series**

    >>> import cudf
    >>> cs1 = cudf.Series([1, 2, 3, 4])
    >>> cs1
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> cs1.truncate(before=1, after=2)
    1    2
    2    3
    dtype: int64

    >>> import cudf
    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s')
    >>> cs2 = cudf.Series(range(len(dates)), index=dates)
    >>> cs2
    2021-01-01 23:45:00     0
    2021-01-01 23:45:01     1
    2021-01-01 23:45:02     2
    2021-01-01 23:45:03     3
    2021-01-01 23:45:04     4
    2021-01-01 23:45:05     5
    2021-01-01 23:45:06     6
    2021-01-01 23:45:07     7
    2021-01-01 23:45:08     8
    2021-01-01 23:45:09     9
    2021-01-01 23:45:10    10
    2021-01-01 23:45:11    11
    2021-01-01 23:45:12    12
    2021-01-01 23:45:13    13
    2021-01-01 23:45:14    14
    2021-01-01 23:45:15    15
    2021-01-01 23:45:16    16
    2021-01-01 23:45:17    17
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
    ...
    2021-01-01 23:45:56    56
    2021-01-01 23:45:57    57
    2021-01-01 23:45:58    58
    2021-01-01 23:45:59    59
    dtype: int64


    >>> cs2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
    2021-01-01 23:45:25    25
    2021-01-01 23:45:26    26
    2021-01-01 23:45:27    27
    dtype: int64

    >>> cs3 = cudf.Series({'A':1, 'B':2, 'C':3, 'D':4})
    >>> cs3
    A    1
    B    2
    C    3
    D    4
    dtype: int64

    >>> cs3.truncate(before='B', after='C')
    B    2
    C    3
    dtype: int64

    **DataFrame**

    >>> df = cudf.DataFrame({
    ...     'A': ['a', 'b', 'c', 'd', 'e'],
    ...     'B': ['f', 'g', 'h', 'i', 'j'],
    ...     'C': ['k', 'l', 'm', 'n', 'o']
    ... }, index=[1, 2, 3, 4, 5])
    >>> df
       A  B  C
    1  a  f  k
    2  b  g  l
    3  c  h  m
    4  d  i  n
    5  e  j  o

    >>> df.truncate(before=2, after=4)
       A  B  C
    2  b  g  l
    3  c  h  m
    4  d  i  n

    >>> df.truncate(before="A", after="B", axis="columns")
       A  B
    1  a  f
    2  b  g
    3  c  h
    4  d  i
    5  e  j

    >>> import cudf
    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s')
    >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates)
    >>> df2.head()
                         A  B
    2021-01-01 23:45:00  1  2
    2021-01-01 23:45:01  1  2
    2021-01-01 23:45:02  1  2
    2021-01-01 23:45:03  1  2
    2021-01-01 23:45:04  1  2

    >>> df2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
                         A  B
    2021-01-01 23:45:18  1  2
    2021-01-01 23:45:19  1  2
    2021-01-01 23:45:20  1  2
    2021-01-01 23:45:21  1  2
    2021-01-01 23:45:22  1  2
    2021-01-01 23:45:23  1  2
    2021-01-01 23:45:24  1  2
    2021-01-01 23:45:25  1  2
    2021-01-01 23:45:26  1  2
    2021-01-01 23:45:27  1  2
    """
    # Views are not supported; reject copy=False up front so that every
    # path below can return a copy unconditionally.
    if not copy:
        raise ValueError("The copy parameter is not supported.")

    axis = self._get_axis_from_axis_arg(axis)
    # Row labels come from the frame's own index; column labels are
    # exposed as a pandas index built from the column accessor.
    ax = self._index if axis == 0 else self._data.to_pandas_index()

    if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
        raise ValueError("truncate requires a sorted index")

    # Allow string bounds on a datetime index by parsing them first.
    # pd.to_datetime(None) is None, so open-ended bounds stay open.
    if isinstance(ax, cudf.core.index.DatetimeIndex):
        before = pd.to_datetime(before)
        after = pd.to_datetime(after)

    if before is not None and after is not None and before > after:
        raise ValueError(f"Truncate: {after} must be after {before}")

    # On a strictly descending axis the larger label comes first, so the
    # label slice has to run from `after` down to `before`.
    if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
        before, after = after, before

    # Select everything on the other axes and the requested label range
    # on the truncated one.
    slicer = [slice(None, None)] * self.ndim
    slicer[axis] = slice(before, after)
    return self.loc[tuple(slicer)].copy()

@cached_property
def loc(self):
"""Select rows and columns by label or boolean mask.
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,10 @@ def __setitem__(self, key, value):
self._frame.iloc[key] = value

def _loc_to_iloc(self, arg):
if isinstance(arg, tuple) and arg and isinstance(arg[0], slice):
if len(arg) > 1:
raise IndexError("Too many Indexers")
arg = arg[0]
if _is_scalar_or_zero_d_array(arg):
if not _is_non_decimal_numeric_dtype(self._frame.index.dtype):
# TODO: switch to cudf.utils.dtypes.is_integer(arg)
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,62 @@ def test_axes(data):
assert_eq(e, a)


def test_dataframe_truncate_axis_0():
    """Row-wise truncate matches pandas for both spellings of axis 0."""
    gdf = cudf.DataFrame(
        dict(A=list("abcde"), B=list("fghij"), C=list("klmno")),
        index=list(range(1, 6)),
    )
    pdf = gdf.to_pandas()

    # Axis selected by name.
    by_name = dict(before=2, after=4, axis="index")
    expected = pdf.truncate(**by_name)
    assert_eq(gdf.truncate(**by_name), expected)

    # Axis selected by number.
    by_number = dict(before=1, after=4, axis=0)
    expected = pdf.truncate(**by_number)
    assert_eq(expected, gdf.truncate(**by_number))


def test_dataframe_truncate_axis_1():
    """Column-wise truncate matches pandas for both spellings of axis 1."""
    gdf = cudf.DataFrame(
        dict(A=list("abcde"), B=list("fghij"), C=list("klmno")),
        index=list(range(1, 6)),
    )
    pdf = gdf.to_pandas()

    # The same label bounds are checked with the axis given by name and
    # then by number.
    for axis in ("columns", 1):
        bounds = dict(before="A", after="B", axis=axis)
        expected = pdf.truncate(**bounds)
        assert_eq(gdf.truncate(**bounds), expected)


def test_dataframe_truncate_datetimeindex():
    """String bounds truncate a datetime-indexed DataFrame like pandas."""
    index = cudf.date_range(
        "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s"
    )
    gdf = cudf.DataFrame(data={"A": 1, "B": 2}, index=index)
    pdf = gdf.to_pandas()

    window = dict(
        before="2021-01-01 23:45:18",
        after="2021-01-01 23:45:27",
    )
    expected = pdf.truncate(**window)
    assert_eq(gdf.truncate(**window), expected)


def test_series_init_none():

# test for creating empty series
Expand Down
32 changes: 32 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,38 @@ def test_axes(data):
assert_eq(e, a)


def test_series_truncate():
    """Series.truncate matches pandas with no, positional and keyword bounds."""
    gsr = cudf.Series([1, 2, 3, 4])
    psr = gsr.to_pandas()

    # (positional args, keyword args) for each call shape under test.
    call_shapes = [
        ((), {}),
        ((1, 2), {}),
        ((), {"before": 1, "after": 2}),
    ]
    for args, kwargs in call_shapes:
        assert_eq(
            gsr.truncate(*args, **kwargs),
            psr.truncate(*args, **kwargs),
        )


def test_series_truncate_errors():
    """Series.truncate raises ValueError for axis=1 and for copy=False."""
    gsr = cudf.Series([1, 2, 3, 4])

    for bad_kwargs in ({"axis": 1}, {"copy": False}):
        with pytest.raises(ValueError):
            gsr.truncate(**bad_kwargs)


def test_series_truncate_datetimeindex():
    """String bounds truncate a datetime-indexed Series like pandas."""
    index = cudf.date_range("2021-01-01", "2021-01-02", freq="s")
    gsr = cudf.Series(range(len(index)), index=index)
    psr = gsr.to_pandas()

    window = dict(
        before="2021-01-01 23:45:18",
        after="2021-01-01 23:45:27",
    )
    assert_eq(gsr.truncate(**window), psr.truncate(**window))


@pytest.mark.parametrize(
"data",
[
Expand Down