From f220e902b1a6ea1361f973a5f5db4709d0507812 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Mon, 18 Jul 2022 09:06:29 -0700
Subject: [PATCH] Implement Groupby pct_change (#11144)

Subsequent to https://github.com/rapidsai/cudf/pull/9805, this PR adds support for Groupby.pct_change()

Fixes https://github.com/rapidsai/cudf/issues/9606
Replaces https://github.com/rapidsai/cudf/pull/10444

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/11144
---
 python/cudf/cudf/core/groupby/groupby.py | 54 ++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 73 ++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 8347c2bd94e..c651cfdf3a1 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1416,6 +1416,60 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         result = self._mimic_pandas_order(result)
         return result._copy_type_metadata(values)
 
+    def pct_change(
+        self, periods=1, fill_method="ffill", axis=0, limit=None, freq=None
+    ):
+        """
+        Calculates the percent change between sequential elements
+        in the group.
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Periods to shift for forming percent change.
+        fill_method : str, default 'ffill'
+            How to handle NAs before computing percent changes.
+            Must be one of 'ffill', 'pad', 'bfill', or 'backfill'.
+        axis : int, default 0
+            Only ``axis=0`` is supported.
+        limit : int, optional
+            The number of consecutive NAs to fill before stopping.
+            Not yet implemented.
+        freq : str, optional
+            Increment to use from time series API.
+            Not yet implemented.
+
+        Returns
+        -------
+        Series or DataFrame
+            Percentage changes within each group
+
+        Raises
+        ------
+        NotImplementedError
+            If ``axis`` is not 0, or if ``limit`` or ``freq`` is given.
+        ValueError
+            If ``fill_method`` is not one of the supported values.
+        """
+        if axis != 0:
+            raise NotImplementedError("Only axis=0 is supported.")
+        if limit is not None:
+            raise NotImplementedError("limit parameter not supported yet.")
+        if freq is not None:
+            raise NotImplementedError("freq parameter not supported yet.")
+        if fill_method not in {"ffill", "pad", "bfill", "backfill"}:
+            raise ValueError(
+                "fill_method must be one of 'ffill', 'pad', "
+                "'bfill', or 'backfill'."
+            )
+
+        # Fill NAs through this groupby's fillna, then regroup the filled
+        # result by the same grouping so the shift sees the filled values.
+        filled = self.fillna(method=fill_method, limit=limit)
+        fill_grp = filled.groupby(self.grouping)
+        shifted = fill_grp.shift(periods=periods, freq=freq)
+        return (filled / shifted) - 1
+
     def _mimic_pandas_order(
         self, result: DataFrameOrSeries
     ) -> DataFrameOrSeries:
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 0750a36461b..bd5e9fe017b 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -2604,3 +2604,76 @@ def test_groupby_transform_maintain_index(by):
     assert_groupby_results_equal(
         pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max")
     )
+
+
+@pytest.mark.parametrize(
+    "data, gkey",
+    [
+        (
+            {
+                "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+                "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+                "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+                "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+            },
+            ["id"],
+        ),
+        (
+            {
+                "id": [0, 0, 0, 0, 1, 1, 1],
+                "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0],
+                "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0],
+            },
+            ["id", "a"],
+        ),
+        (
+            {
+                "id": ["a", "a", "b", "b", "c", "c"],
+                "val1": [None, None, None, None, None, None],
+            },
+            ["id"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
+def test_groupby_pct_change(data, gkey, periods, fill_method):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.groupby(gkey).pct_change(
+        periods=periods, fill_method=fill_method
+    )
+    expected = pdf.groupby(gkey).pct_change(
+        periods=periods, fill_method=fill_method
+    )
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11259")
+@pytest.mark.parametrize("periods", [-5, 5])
+def test_groupby_pct_change_multiindex_dataframe(periods):
+    gdf = cudf.DataFrame(
+        {
+            "a": [1, 1, 2, 2],
+            "b": [1, 1, 2, 3],
+            "c": [2, 3, 4, 5],
+            "d": [6, 8, 9, 1],
+        }
+    ).set_index(["a", "b"])
+
+    actual = gdf.groupby(level=["a", "b"]).pct_change(periods)
+    expected = gdf.to_pandas().groupby(level=["a", "b"]).pct_change(periods)
+
+    assert_eq(expected, actual)
+
+
+def test_groupby_pct_change_empty_columns():
+    gdf = cudf.DataFrame(columns=["id", "val1", "val2"])
+    pdf = gdf.to_pandas()
+
+    actual = gdf.groupby("id").pct_change()
+    expected = pdf.groupby("id").pct_change()
+
+    assert_eq(expected, actual)