Fix result representation in groupby.apply (#6790)

Fixes: #6745, #6788 This PR fixes the result representation of groupby apply function when the custom udf returns a scalar, Series or a DataFrame.
rapidsai · Nov 24, 2020 · cdd72c9 · cdd72c9
1 parent db066de
commit cdd72c9
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -149,6 +149,7 @@
 - PR #6632 Fix DataFrame initialization from list of dicts
 - PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest
 - PR #6787 Update java reduction APIs to reflect C++ changes
+- PR #6790 Fix result representation in groupby.apply
 - PR #6794 Fix AVRO reader issues with empty input
 - PR #6798 Fix `read_avro` docs
 - PR #6824 Fix JNI build

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -415,7 +415,23 @@ def mult(df):
         chunks = [
             grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])
         ]
-        result = cudf.concat([function(chk) for chk in chunks])
+        chunk_results = [function(chk) for chk in chunks]
+
+        if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar(
+            chunk_results[0]
+        ):
+            result = cudf.Series(
+                chunk_results, index=self.grouping.keys[offsets[:-1]]
+            )
+            result.index.names = self.grouping.names
+        elif len(chunk_results) > 0 and isinstance(
+            chunk_results[0], cudf.Series
+        ):
+            result = cudf.concat(chunk_results, axis=1).T
+            result.index.names = self.grouping.names
+        else:
+            result = cudf.concat(chunk_results)
+
         if self._sort:
             result = result.sort_index()
         return result

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1271,3 +1271,51 @@ def test_groupby_pipe():
     actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min())
 
     assert_eq(expected, actual)
+
+
+def test_groupby_apply_return_scalars():
+    pdf = pd.DataFrame(
+        {
+            "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
+            "B": [
+                0.01,
+                np.nan,
+                0.03,
+                0.04,
+                np.nan,
+                0.06,
+                0.07,
+                0.08,
+                0.09,
+                1.0,
+            ],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    def custom_map_func(x):
+        x = x[~x["B"].isna()]
+        ticker = x.shape[0]
+        full = ticker / 10
+        return full
+
+    expected = pdf.groupby("A").apply(lambda x: custom_map_func(x))
+    actual = gdf.groupby("A").apply(lambda x: custom_map_func(x))
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "cust_func",
+    [lambda x: x - x.max(), lambda x: x.min() - x.max(), lambda x: x.min()],
+)
+def test_groupby_apply_return_series_dataframe(cust_func):
+    pdf = pd.DataFrame(
+        {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    expected = pdf.groupby(["key"]).apply(cust_func)
+    actual = gdf.groupby(["key"]).apply(cust_func)
+
+    assert_eq(expected, actual)