From c4fba3fc368fd72512eff93dc17ffbf63e78591a Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 30 Nov 2021 17:22:13 -0800 Subject: [PATCH 1/9] create new pr --- python/cudf/cudf/core/dataframe.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c0cb6f1917f..902a78f34d9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6329,6 +6329,30 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def pct_change(self): + """ + Calculates the percent change between sequential elements + in the DataFrame. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'ffill' + How to handle NAs before computing percent changes. + limit : int, optional + The number of consecutive NAs to fill before stopping. + Not yet implemented. + freq : str, optional + Increment to use from time series API. + Not yet implemented. + + Returns + ------- + DataFrame + """ + pass + def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ): From b0f6ba347a1b50087f15755e21b5d638ebba9790 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 1 Dec 2021 17:48:29 -0800 Subject: [PATCH 2/9] TO-DO: implem. diff method for dataframes - separate PR --- python/cudf/cudf/core/dataframe.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 902a78f34d9..e534585205e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6329,7 +6329,9 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) - def pct_change(self): + def pct_change( + self, periods=1, fill_method="ffill", limit=None, freq=None + ): """ Calculates the percent change between sequential elements in the DataFrame. @@ -6351,7 +6353,20 @@ def pct_change(self): ------- DataFrame """ - pass + if limit is not None: + raise NotImplementedError("limit parameter not supported yet.") + if freq is not None: + raise NotImplementedError("freq parameter not supported yet.") + elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + raise ValueError( + "fill_method must be one of 'ffill', 'pad', " + "'bfill', or 'backfill'." + ) + + data = self.fillna(method=fill_method, limit=limit) + data_diff = data.diff(periods=periods) # need to implem. diff method + change = data_diff / data.shift(periods=periods, freq=freq) + return change def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True From a9414418f3685ec5650cfad840b9e02031637614 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 2 Dec 2021 21:59:29 -0800 Subject: [PATCH 3/9] addressed review --- python/cudf/cudf/core/dataframe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e534585205e..00c5c267a3b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6364,9 +6364,10 @@ def pct_change( ) data = self.fillna(method=fill_method, limit=limit) - data_diff = data.diff(periods=periods) # need to implem. diff method - change = data_diff / data.shift(periods=periods, freq=freq) - return change + + return data.diff(periods=periods) / data.shift( + periods=periods, freq=freq + ) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True From 80546474333ca2b5a177ca62405e874d2f6a5816 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 2 Dec 2021 22:02:56 -0800 Subject: [PATCH 4/9] adding tests, WIP --- python/cudf/cudf/tests/test_dataframe.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d555b5c4033..6a274569c9d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9039,3 +9039,25 @@ def test_pearson_corr_multiindex_dataframe(): expected = gdf.to_pandas().groupby(level="a").corr("pearson") assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + np.random.normal(-100, 100, (50, 50)), + np.random.randint(-50, 50, (25, 30)), + np.random.random_sample((4, 4)), + np.random.uniform(10.5, 75.5, (10, 6)), + np.array([1.123, 2.343, 5.890, 0.0]), + ], +) +@pytest.mark.parametrize("periods", range(-5, 5)) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +def test_dataframe_pct_change(data, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + expected = pdf.pct_change(periods=periods, fill_method=fill_method) + + assert_eq(expected, actual) From 76c30c1ccf4a3159246f2da24f4693bf99578431 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 10 Feb 2022 02:13:46 -0800 Subject: [PATCH 5/9] use seed to generate random test data --- python/cudf/cudf/tests/test_dataframe.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e4fccd4f481..f97173f297a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9184,14 +9184,13 @@ def test_dataframe_rename_duplicate_column(): @pytest.mark.parametrize( "data", [ - np.random.normal(-100, 100, (50, 50)), - np.random.randint(-50, 50, (25, 30)), - np.random.random_sample((4, 4)), - np.random.uniform(10.5, 75.5, (10, 6)), + np.random.RandomState(seed=10).randint(-50, 50, (25, 30)), + np.random.RandomState(seed=10).random_sample((4, 4)), np.array([1.123, 2.343, 5.890, 0.0]), + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, ], ) -@pytest.mark.parametrize("periods", range(-5, 5)) +@pytest.mark.parametrize("periods", (-2, -1, 0, 1, 2)) @pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) def test_dataframe_pct_change(data, periods, fill_method): gdf = cudf.DataFrame(data) From e1c5d855be964be6fe990d1685903b9fc8fe806b Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Wed, 16 Feb 2022 13:30:43 -0800 Subject: [PATCH 6/9] Update python/cudf/cudf/tests/test_dataframe.py Co-authored-by: Michael Wang --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 81e2c8b2a2f..35cc3ba74d0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9188,7 +9188,7 @@ def test_dataframe_rename_duplicate_column(): {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, ], ) -@pytest.mark.parametrize("periods", (-2, -1, 0, 1, 2)) +@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) @pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) def test_dataframe_pct_change(data, periods, fill_method): gdf = cudf.DataFrame(data) From c264ea4365ef6ba598486d1beb79b4a54fb8cb53 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 16 Feb 2022 16:14:08 -0800 Subject: [PATCH 7/9] handles case when periods > len(df) --- python/cudf/cudf/core/column/column.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 393afe4a5b9..5d5e85e4d79 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -335,6 +335,12 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: + # link to the libcudf ticket you will create + if abs(offset) > len(self): + if fill_value is None: + return column_empty_like(self, masked=True) + else: + return full(len(self), fill_value, dtype=self.dtype) return libcudf.copying.shift(self, offset, fill_value) @property From 0f68434898881cb6a0abb157a9233d468c1032ef Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 16 Feb 2022 16:17:27 -0800 Subject: [PATCH 8/9] avoid check_dtype, reduce test cases for periods --- python/cudf/cudf/tests/test_dataframe.py | 28 +++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 35cc3ba74d0..5b9c73fd827 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3442,29 +3442,37 @@ def test_get_numeric_data(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) @pytest.mark.parametrize("data_empty", [False, True]) def test_shift(dtype, period, data_empty): - + # TODO : this function currently tests for series.shift() + # but should instead test for dataframe.shift() if data_empty: data = None else: if dtype == np.int8: # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) + data = gen_rand(dtype, 10, low=-2, high=2) else: - data = gen_rand(dtype, 100000) + data = gen_rand(dtype, 10) - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) + ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - shifted_outcome = gdf.a.shift(period).fillna(0) - expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype) + shifted_outcome = gs.a.shift(period) + expected_outcome = ps.a.shift(period) + # pandas uses NaNs to signal missing value and force converts the + # results columns to float types if data_empty: - assert_eq(shifted_outcome, expected_outcome, check_index_type=False) + assert_eq( + shifted_outcome, + expected_outcome, + check_index_type=False, + check_dtype=False, + ) else: - assert_eq(shifted_outcome, expected_outcome) + assert_eq(shifted_outcome, expected_outcome, check_dtype=False) @pytest.mark.parametrize("dtype", NUMERIC_TYPES) From c3c36fba6c05ab9484aad74588e5e2865908527f Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 16 Feb 2022 16:28:01 -0800 Subject: [PATCH 9/9] added link to created bug-fix libcudf ticket --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5d5e85e4d79..1c1c2ef2bf6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -335,7 +335,9 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: - # link to the libcudf ticket you will create + # libcudf currently doesn't handle case when offset > len(df) + # ticket to fix the bug in link below: + # https://github.com/rapidsai/cudf/issues/10314 if abs(offset) > len(self): if fill_value is None: return column_empty_like(self, masked=True) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5b9c73fd827..f1468801732 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9190,7 +9190,7 @@ def test_dataframe_rename_duplicate_column(): @pytest.mark.parametrize( "data", [ - np.random.RandomState(seed=10).randint(-50, 50, (25, 30)), + np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), np.random.RandomState(seed=10).random_sample((4, 4)), np.array([1.123, 2.343, 5.890, 0.0]), {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]},