From cdd72c90d84cec5cc3648f3e74954556a88aca91 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 23 Nov 2020 19:33:21 -0600 Subject: [PATCH] Fix result representation in groupby.apply (#6790) Fixes: #6745, #6788 This PR fixes the result representation of groupby apply function when the custom udf returns a scalar, Series or a DataFrame. --- CHANGELOG.md | 1 + python/cudf/cudf/core/groupby/groupby.py | 18 ++++++++- python/cudf/cudf/tests/test_groupby.py | 48 ++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5750d7e8e3f..f811964c75b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,6 +149,7 @@ - PR #6632 Fix DataFrame initialization from list of dicts - PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest - PR #6787 Update java reduction APIs to reflect C++ changes +- PR #6790 Fix result representation in groupby.apply - PR #6794 Fix AVRO reader issues with empty input - PR #6798 Fix `read_avro` docs - PR #6824 Fix JNI build diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2a5aa599a06..6bbb3d0c1de 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -415,7 +415,23 @@ def mult(df): chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] - result = cudf.concat([function(chk) for chk in chunks]) + chunk_results = [function(chk) for chk in chunks] + + if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar( + chunk_results[0] + ): + result = cudf.Series( + chunk_results, index=self.grouping.keys[offsets[:-1]] + ) + result.index.names = self.grouping.names + elif len(chunk_results) > 0 and isinstance( + chunk_results[0], cudf.Series + ): + result = cudf.concat(chunk_results, axis=1).T + result.index.names = self.grouping.names + else: + result = cudf.concat(chunk_results) + if self._sort: result = result.sort_index() return result diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b1f0dba50f7..1e9f6305c4d 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1271,3 +1271,51 @@ def test_groupby_pipe(): actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) assert_eq(expected, actual) + + +def test_groupby_apply_return_scalars(): + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], + "B": [ + 0.01, + np.nan, + 0.03, + 0.04, + np.nan, + 0.06, + 0.07, + 0.08, + 0.09, + 1.0, + ], + } + ) + gdf = cudf.from_pandas(pdf) + + def custom_map_func(x): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / 10 + return full + + expected = pdf.groupby("A").apply(lambda x: custom_map_func(x)) + actual = gdf.groupby("A").apply(lambda x: custom_map_func(x)) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "cust_func", + [lambda x: x - x.max(), lambda x: x.min() - x.max(), lambda x: x.min()], +) +def test_groupby_apply_return_series_dataframe(cust_func): + pdf = pd.DataFrame( + {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby(["key"]).apply(cust_func) + actual = gdf.groupby(["key"]).apply(cust_func) + + assert_eq(expected, actual)