Skip to content

Commit

Permalink
Implement DataFrame diff() (#9817)
Browse files Browse the repository at this point in the history
Fixes: #9604 and resolves #1271

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)
  - Michael Wang (https://github.com/isVoid)

URL: #9817
  • Loading branch information
skirui-source authored Feb 5, 2022
1 parent e5ba292 commit 2e458b9
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
75 changes: 75 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pyarrow as pa
from nvtx import annotate
from pandas._config import get_option
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -2542,6 +2543,80 @@ def insert(self, loc, name, value, nan_as_null=None):

self._data.insert(name, value, loc=loc)

def diff(self, periods=1, axis=0):
    """
    First discrete difference of element.

    Calculates the difference of a DataFrame element compared with another
    element in the DataFrame (default is element in previous row).

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for calculating difference,
        accepts negative values. An integer-valued float
        (e.g. ``2.0``) is accepted and converted to ``int``.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Take difference over rows (0) or columns (1).
        Only row-wise (0) shift is supported.

    Returns
    -------
    DataFrame
        First differences of the DataFrame.

    Raises
    ------
    ValueError
        If ``periods`` is neither an integer nor an integer-valued float.
    NotImplementedError
        If ``axis`` does not resolve to 0, or if any column has a
        non-numeric dtype.

    Notes
    -----
    Diff currently only supports numeric dtype columns.

    Examples
    --------
    >>> import cudf
    >>> gdf = cudf.DataFrame({'a': [1, 2, 3, 4, 5, 6],
    ...                       'b': [1, 1, 2, 3, 5, 8],
    ...                       'c': [1, 4, 9, 16, 25, 36]})
    >>> gdf
       a  b   c
    0  1  1   1
    1  2  1   4
    2  3  2   9
    3  4  3  16
    4  5  5  25
    5  6  8  36
    >>> gdf.diff(periods=2)
          a     b     c
    0  <NA>  <NA>  <NA>
    1  <NA>  <NA>  <NA>
    2     2     1     8
    3     2     2    12
    4     2     3    16
    5     2     5    20
    """
    # Validate ``periods`` first so bad input fails before any data work.
    if not is_integer(periods):
        if not (is_float(periods) and periods.is_integer()):
            raise ValueError("periods must be an integer")
        periods = int(periods)

    axis = self._get_axis_from_axis_arg(axis)
    if axis != 0:
        raise NotImplementedError("Only axis=0 is supported.")

    if not all(is_numeric_dtype(i) for i in self.dtypes):
        raise NotImplementedError(
            "DataFrame.diff only supports numeric dtypes"
        )

    if abs(periods) > len(self):
        # Shifting past either end leaves nothing to subtract: every value
        # is null. Pass the index explicitly so this shortcut keeps the
        # same row labels as the regular ``self - self.shift(...)`` path.
        return cudf.DataFrame._from_data(
            {
                name: column_empty(len(self), dtype=dtype, masked=True)
                for name, dtype in zip(self.columns, self.dtypes)
            },
            index=self.index,
        )

    return self - self.shift(periods=periods)

def drop(
self,
labels=None,
Expand Down
71 changes: 71 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9065,6 +9065,77 @@ def test_dataframe_add_suffix():
assert_eq(got, expected)


@pytest.mark.parametrize(
    "data",
    [
        np.random.RandomState(seed=10).randint(-50, 50, (25, 30)),
        np.random.RandomState(seed=10).random_sample((4, 4)),
        np.array([1.123, 2.343, 5.890, 0.0]),
        [True, False, True, False, False],
        {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]},
    ],
)
@pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5))
def test_diff_dataframe_numeric_dtypes(data, periods):
    """Row-wise ``diff`` on numeric data must agree with pandas.

    Covers integer, float, boolean and null-containing inputs for
    forward, backward and zero shifts. Dtypes are not compared.
    """
    cudf_frame = cudf.DataFrame(data)
    pandas_frame = cudf_frame.to_pandas()

    want = pandas_frame.diff(periods=periods, axis=0)
    got = cudf_frame.diff(periods=periods, axis=0)

    assert_eq(want, got, check_dtype=False)


@pytest.mark.parametrize(
    ("precision", "scale"), [(5, 2), (8, 5)],
)
@pytest.mark.parametrize(
    "dtype", [cudf.Decimal32Dtype, cudf.Decimal64Dtype],
)
def test_diff_decimal_dtypes(precision, scale, dtype):
    """Default ``diff`` on decimal columns must agree with pandas.

    The frame is filled with seeded uniform random values and cast to the
    parametrized decimal dtype. Dtypes are not compared.
    """
    values = np.random.default_rng(seed=42).uniform(10.5, 75.5, (10, 6))
    decimal_dtype = dtype(precision=precision, scale=scale)

    cudf_frame = cudf.DataFrame(values, dtype=decimal_dtype)
    pandas_frame = cudf_frame.to_pandas()

    got = cudf_frame.diff()
    want = pandas_frame.diff()

    assert_eq(want, got, check_dtype=False)


def test_diff_dataframe_invalid_axis():
    """``diff`` along ``axis=1`` must raise ``NotImplementedError``."""
    single_column = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0]))
    expected_error = pytest.raises(
        NotImplementedError, match="Only axis=0 is supported."
    )
    with expected_error:
        single_column.diff(periods=1, axis=1)


@pytest.mark.parametrize(
    "data",
    [
        {
            "int_col": [1, 2, 3, 4, 5],
            "float_col": [1.0, 2.0, 3.0, 4.0, 5.0],
            "string_col": ["a", "b", "c", "d", "e"],
        },
        ["a", "b", "c", "d", "e"],
        [np.nan, None, np.nan, None],
    ],
)
def test_diff_dataframe_non_numeric_dypes(data):
    """``diff`` must raise ``NotImplementedError`` for each parametrized input."""
    frame = cudf.DataFrame(data)
    expected_error = pytest.raises(
        NotImplementedError,
        match="DataFrame.diff only supports numeric dtypes",
    )
    with expected_error:
        frame.diff(periods=2, axis=0)


def test_dataframe_assign_cp_np_array():
m, n = 5, 3
cp_ndarray = cupy.random.randn(m, n)
Expand Down

0 comments on commit 2e458b9

Please sign in to comment.