From 2e458b93b639d5068452c9559ff045b60cc9b933 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Fri, 4 Feb 2022 19:52:33 -0800 Subject: [PATCH] Implement DataFrame diff() (#9817) Fixes: https://github.com/rapidsai/cudf/issues/9604 and resolves https://github.com/rapidsai/cudf/issues/1271 Authors: - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9817 --- python/cudf/cudf/core/dataframe.py | 75 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 71 ++++++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9d179994174..25f2f71b40d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -19,6 +19,7 @@ import pyarrow as pa from nvtx import annotate from pandas._config import get_option +from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing @@ -2542,6 +2543,80 @@ def insert(self, loc, name, value, nan_as_null=None): self._data.insert(name, value, loc=loc) + def diff(self, periods=1, axis=0): + """ + First discrete difference of element. + + Calculates the difference of a DataFrame element compared with another + element in the DataFrame (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, + accepts negative values. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Take difference over rows (0) or columns (1). + Only row-wise (0) shift is supported. + + Returns + ------- + DataFrame + First differences of the DataFrame. + + Notes + ----- + Diff currently only supports numeric dtype columns. 
+ + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> gdf + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + >>> gdf.diff(periods=2) + a b c + 0 <NA> <NA> <NA> + 1 <NA> <NA> <NA> + 2 2 1 8 + 3 2 2 12 + 4 2 3 16 + 5 2 5 20 + + """ + if not is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) + + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: + raise NotImplementedError("Only axis=0 is supported.") + + if not all(is_numeric_dtype(i) for i in self.dtypes): + raise NotImplementedError( + "DataFrame.diff only supports numeric dtypes" + ) + + if abs(periods) > len(self): + df = cudf.DataFrame._from_data( + { + name: column_empty(len(self), dtype=dtype, masked=True) + for name, dtype in zip(self.columns, self.dtypes) + } + ) + return df + + return self - self.shift(periods=periods) + def drop( self, labels=None, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5022f1a675b..fb173bc0eab 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9065,6 +9065,77 @@ def test_dataframe_add_suffix(): assert_eq(got, expected) +@pytest.mark.parametrize( + "data", + [ + np.random.RandomState(seed=10).randint(-50, 50, (25, 30)), + np.random.RandomState(seed=10).random_sample((4, 4)), + np.array([1.123, 2.343, 5.890, 0.0]), + [True, False, True, False, False], + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) +def test_diff_dataframe_numeric_dtypes(data, periods): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.diff(periods=periods, axis=0) + expected = pdf.diff(periods=periods, axis=0) + + assert_eq( + expected, actual, check_dtype=False, + ) + + +@pytest.mark.parametrize(
+ ("precision", "scale"), [(5, 2), (8, 5)], +) +@pytest.mark.parametrize( + "dtype", [cudf.Decimal32Dtype, cudf.Decimal64Dtype], +) +def test_diff_decimal_dtypes(precision, scale, dtype): + gdf = cudf.DataFrame( + np.random.default_rng(seed=42).uniform(10.5, 75.5, (10, 6)), + dtype=dtype(precision=precision, scale=scale), + ) + pdf = gdf.to_pandas() + + actual = gdf.diff() + expected = pdf.diff() + + assert_eq( + expected, actual, check_dtype=False, + ) + + +def test_diff_dataframe_invalid_axis(): + gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) + with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): + gdf.diff(periods=1, axis=1) + + +@pytest.mark.parametrize( + "data", + [ + { + "int_col": [1, 2, 3, 4, 5], + "float_col": [1.0, 2.0, 3.0, 4.0, 5.0], + "string_col": ["a", "b", "c", "d", "e"], + }, + ["a", "b", "c", "d", "e"], + [np.nan, None, np.nan, None], + ], +) +def test_diff_dataframe_non_numeric_dypes(data): + gdf = cudf.DataFrame(data) + with pytest.raises( + NotImplementedError, + match="DataFrame.diff only supports numeric dtypes", + ): + gdf.diff(periods=2, axis=0) + + def test_dataframe_assign_cp_np_array(): m, n = 5, 3 cp_ndarray = cupy.random.randn(m, n)