Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding feature Truncate to DataFrame and Series #11435

Merged
merged 23 commits into branch-22.12 from truncate
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9ab9e90
Added Truncate Functionality and test cases
VamsiTallam95 Aug 1, 2022
1c5082c
style and docstring fix
VamsiTallam95 Aug 1, 2022
0b28765
Added truncate functionality to dataframe
VamsiTallam95 Aug 2, 2022
4194427
cleaner version
VamsiTallam95 Aug 2, 2022
2fd181d
Update python/cudf/cudf/tests/test_dataframe.py
VamsiTallam95 Aug 17, 2022
b69f306
Update python/cudf/cudf/core/dataframe.py
VamsiTallam95 Aug 17, 2022
2d6b20e
Update python/cudf/cudf/tests/test_dataframe.py
VamsiTallam95 Aug 17, 2022
2d2271e
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
2734bd2
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
ddede29
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
31f6696
Update python/cudf/cudf/core/series.py
VamsiTallam95 Aug 17, 2022
42fb830
incorporating feedback
VamsiTallam95 Aug 17, 2022
033f27e
fixing failed tests
VamsiTallam95 Aug 17, 2022
3f740bc
incorporating feedback
VamsiTallam95 Aug 18, 2022
42b4c7f
fixing style
VamsiTallam95 Aug 18, 2022
900110f
fixing doctests
VamsiTallam95 Aug 18, 2022
7ce214c
Merge remote-tracking branch 'origin/branch-22.12' into truncate
vyasr Nov 2, 2022
9a24b15
Fix Series indexing bug when given a length 1 list of slices.
vyasr Nov 4, 2022
530b54b
Move method into IndexedFrame.py.
vyasr Nov 4, 2022
e6446e8
Document that copy=False is not supported.
vyasr Nov 4, 2022
2e6ba3b
Address PR comments.
vyasr Nov 4, 2022
515fa22
Remove conditional on copy.
vyasr Nov 4, 2022
b3bea25
Shrink test.
vyasr Nov 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2886,6 +2886,114 @@ def axes(self):
"""
return [self._index, self._data.to_pandas_index()]

@_cudf_nvtx_annotate
def truncate(self, before=None, after=None, axis=0, copy=True):
    """
    Truncate a DataFrame before and after some index value.

    This is a useful shorthand for boolean indexing based on index
    values above or below certain thresholds.

    Parameters
    ----------
    before : date, str, int
        Truncate all rows before this index value.
    after : date, str, int
        Truncate all rows after this index value.
    axis : {0 or 'index', 1 or 'columns'}, optional
        Axis to truncate. Truncates the index (rows) by default.
    copy : bool, default is True
        Return a copy of the truncated section.

    Returns
    -------
    The truncated Series or DataFrame.

    Raises
    ------
    ValueError
        If the labels along the selected axis are not sorted, or if
        `before` is greater than `after`.

    Notes
    -----
    If the index being truncated contains only datetime values,
    `before` and `after` may be specified as strings instead of
    Timestamps.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({
    ...     'A': ['a', 'b', 'c', 'd', 'e'],
    ...     'B': ['f', 'g', 'h', 'i', 'j'],
    ...     'C': ['k', 'l', 'm', 'n', 'o']
    ... }, index=[1, 2, 3, 4, 5])
    >>> df
       A  B  C
    1  a  f  k
    2  b  g  l
    3  c  h  m
    4  d  i  n
    5  e  j  o

    >>> df.truncate(before=2, after=4)
       A  B  C
    2  b  g  l
    3  c  h  m
    4  d  i  n

    >>> df.truncate(before="A", after="B", axis="columns")
       A  B
    1  a  f
    2  b  g
    3  c  h
    4  d  i
    5  e  j

    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s')
    >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates)
    >>> df2.head()
                         A  B
    2021-01-01 23:45:00  1  2
    2021-01-01 23:45:01  1  2
    2021-01-01 23:45:02  1  2
    2021-01-01 23:45:03  1  2
    2021-01-01 23:45:04  1  2

    >>> df2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
                         A  B
    2021-01-01 23:45:18  1  2
    2021-01-01 23:45:19  1  2
    2021-01-01 23:45:20  1  2
    2021-01-01 23:45:21  1  2
    2021-01-01 23:45:22  1  2
    2021-01-01 23:45:23  1  2
    2021-01-01 23:45:24  1  2
    2021-01-01 23:45:25  1  2
    2021-01-01 23:45:26  1  2
    2021-01-01 23:45:27  1  2
    """
    # NOTE(review): this logic mirrors Series.truncate. Reviewers suggested
    # hoisting it into python/cudf/cudf/core/indexed_frame.py so DataFrame
    # and Series inherit a single implementation; an earlier attempt was
    # reported to fail because of functionality missing at that level.
    axis = self._get_axis_from_axis_arg(axis)
    ax = self._index if axis == 0 else self._data.to_pandas_index()

    if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
        raise ValueError("truncate requires a sorted index")

    # Coerce the bounds so that string arguments compare and slice as
    # timestamps on a datetime index.
    if isinstance(ax, cudf.core.index.DatetimeIndex):
        before = pd.to_datetime(before)
        after = pd.to_datetime(after)

    if before is not None and after is not None and before > after:
        raise ValueError(f"Truncate: {after} must be after {before}")

    # On a descending axis the label slice has to run from `after` to
    # `before`. The nunique() check leaves constant-valued axes alone, since
    # those are simultaneously monotonic increasing and decreasing.
    if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
        before, after = after, before

    # Select everything along the other axis and the label range on `axis`.
    slicer = [slice(None, None)] * self.ndim
    slicer[axis] = slice(before, after)
    result = self.loc[tuple(slicer)]

    if copy:
        result = result.copy()

    return result

def diff(self, periods=1, axis=0):
"""
First discrete difference of element.
Expand Down
133 changes: 133 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,139 @@ def axes(self):
"""
return [self.index]

@_cudf_nvtx_annotate
def truncate(self, before=None, after=None, axis=0, copy=True):
    """
    Truncate a Series before and after some index value.

    This is a useful shorthand for boolean indexing based on index
    values above or below certain thresholds.

    Parameters
    ----------
    before : date, str, int
        Truncate all rows before this index value.
    after : date, str, int
        Truncate all rows after this index value.
    axis : {0 or 'index'}, optional
        Axis to truncate. Truncates the index (rows) by default.
    copy : bool, default is True
        Return a copy of the truncated section.

    Returns
    -------
    The truncated Series.

    Raises
    ------
    ValueError
        If `axis` is anything other than 0 or 'index', if the index is
        not sorted, or if `before` is greater than `after`.

    Notes
    -----
    If the index being truncated contains only datetime values,
    `before` and `after` may be specified as strings instead of
    Timestamps.

    Examples
    --------
    >>> import cudf
    >>> cs1 = cudf.Series([1, 2, 3, 4])
    >>> cs1
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> cs1.truncate(before=1, after=2)
    1    2
    2    3
    dtype: int64

    >>> import cudf
    >>> dates = cudf.date_range(
    ...     '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s')
    >>> cs2 = cudf.Series(range(len(dates)), index=dates)
    >>> cs2
    2021-01-01 23:45:00     0
    2021-01-01 23:45:01     1
    2021-01-01 23:45:02     2
    2021-01-01 23:45:03     3
    2021-01-01 23:45:04     4
    2021-01-01 23:45:05     5
    2021-01-01 23:45:06     6
    2021-01-01 23:45:07     7
    2021-01-01 23:45:08     8
    2021-01-01 23:45:09     9
    2021-01-01 23:45:10    10
    2021-01-01 23:45:11    11
    2021-01-01 23:45:12    12
    2021-01-01 23:45:13    13
    2021-01-01 23:45:14    14
    2021-01-01 23:45:15    15
    2021-01-01 23:45:16    16
    2021-01-01 23:45:17    17
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
                          ...
    2021-01-01 23:45:56    56
    2021-01-01 23:45:57    57
    2021-01-01 23:45:58    58
    2021-01-01 23:45:59    59
    dtype: int64


    >>> cs2.truncate(
    ...     before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
    2021-01-01 23:45:18    18
    2021-01-01 23:45:19    19
    2021-01-01 23:45:20    20
    2021-01-01 23:45:21    21
    2021-01-01 23:45:22    22
    2021-01-01 23:45:23    23
    2021-01-01 23:45:24    24
    2021-01-01 23:45:25    25
    2021-01-01 23:45:26    26
    2021-01-01 23:45:27    27
    dtype: int64

    >>> cs3 = cudf.Series({'A':1, 'B':2, 'C':3, 'D':4})
    >>> cs3
    A    1
    B    2
    C    3
    D    4
    dtype: int64

    >>> cs3.truncate(before='B', after='C')
    B    2
    C    3
    dtype: int64
    """
    # A Series has a single axis, so only the row axis is accepted.
    if axis not in (0, "index"):
        raise ValueError(f"No axis named {axis} for object type Series")
    ax = self.index

    if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
        raise ValueError("truncate requires a sorted index")

    # `ax` is an index *instance*; the previous `ax is DatetimeIndex`
    # identity test against the class could never be true, so string bounds
    # were never coerced to timestamps.
    if isinstance(ax, cudf.core.index.DatetimeIndex):
        before = pd.to_datetime(before)
        after = pd.to_datetime(after)

    if before is not None and after is not None and before > after:
        raise ValueError(f"Truncate: {after} must be after {before}")

    # On a descending index the label slice has to run from `after` to
    # `before`. The nunique() check leaves constant-valued indexes alone,
    # since those are simultaneously monotonic increasing and decreasing.
    if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
        before, after = after, before

    slicer = slice(before, after)
    result = self.loc[slicer]

    if copy:
        result = result.copy()

    return result

@_cudf_nvtx_annotate
def serialize(self):
header, frames = super().serialize()
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,62 @@ def test_axes(data):
assert_eq(e, a)


def test_dataframe_truncate_axis_0():
    """Row-wise truncate must match pandas for both spellings of axis 0."""
    columns = {
        "A": ["a", "b", "c", "d", "e"],
        "B": ["f", "g", "h", "i", "j"],
        "C": ["k", "l", "m", "n", "o"],
    }
    gdf = cudf.DataFrame(columns, index=[1, 2, 3, 4, 5])
    host_df = gdf.to_pandas()

    # Axis given by name.
    by_name = dict(before=2, after=4, axis="index")
    want = host_df.truncate(**by_name)
    got = gdf.truncate(**by_name)
    assert_eq(got, want)

    # Axis given by number.
    by_number = dict(before=1, after=4, axis=0)
    want = host_df.truncate(**by_number)
    got = gdf.truncate(**by_number)
    assert_eq(want, got)


def test_dataframe_truncate_axis_1():
    """Column-wise truncate must match pandas for both spellings of axis 1."""
    columns = {
        "A": ["a", "b", "c", "d", "e"],
        "B": ["f", "g", "h", "i", "j"],
        "C": ["k", "l", "m", "n", "o"],
    }
    gdf = cudf.DataFrame(columns, index=[1, 2, 3, 4, 5])
    host_df = gdf.to_pandas()

    # Same label bounds, first with the axis name and then its number.
    for axis_arg in ("columns", 1):
        want = host_df.truncate(before="A", after="B", axis=axis_arg)
        got = gdf.truncate(before="A", after="B", axis=axis_arg)
        assert_eq(got, want)


def test_dataframe_truncate_datetimeindex():
    """String bounds on a datetime-indexed DataFrame must match pandas."""
    index = cudf.date_range(
        "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s"
    )
    gdf = cudf.DataFrame(data={"A": 1, "B": 2}, index=index)
    host_df = gdf.to_pandas()

    bounds = dict(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
    want = host_df.truncate(**bounds)
    got = gdf.truncate(**bounds)

    assert_eq(got, want)


def test_series_init_none():

# test for creating empty series
Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,37 @@ def test_axes(data):
assert_eq(e, a)


def test_series_truncate():
    """Series.truncate must match pandas with no, positional and keyword bounds."""
    gsr = cudf.Series([1, 2, 3, 4])
    host_sr = gsr.to_pandas()

    # No bounds: the whole Series comes back.
    got = gsr.truncate()
    want = host_sr.truncate()
    assert_eq(got, want)

    # Positional bounds.
    got = gsr.truncate(1, 2)
    want = host_sr.truncate(1, 2)
    assert_eq(got, want)

    # Keyword bounds.
    got = gsr.truncate(before=1, after=2)
    want = host_sr.truncate(before=1, after=2)
    assert_eq(got, want)


def test_series_truncate_invalid_axis():
    """Any axis other than the index must be rejected with a ValueError."""
    series = cudf.Series([1, 2, 3, 4])
    with pytest.raises(
        ValueError, match="No axis named 1 for object type Series"
    ):
        series.truncate(axis=1)


def test_series_truncate_datetimeindex():
    """String bounds on a datetime-indexed Series must match pandas."""
    index = cudf.date_range("2021-01-01", "2021-01-02", freq="s")
    gsr = cudf.Series(range(len(index)), index=index)
    host_sr = gsr.to_pandas()

    bounds = dict(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27")
    got = gsr.truncate(**bounds)
    want = host_sr.truncate(**bounds)

    assert_eq(got, want)


@pytest.mark.parametrize(
"data",
[
Expand Down