From cdd72c90d84cec5cc3648f3e74954556a88aca91 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 23 Nov 2020 19:33:21 -0600
Subject: [PATCH] Fix result representation in groupby.apply (#6790)

Fixes: #6745, #6788

This PR fixes the result representation of groupby apply function when the custom udf returns a scalar, Series or a DataFrame.
---
 CHANGELOG.md                             |  1 +
 python/cudf/cudf/core/groupby/groupby.py | 18 ++++++++-
 python/cudf/cudf/tests/test_groupby.py   | 48 ++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5750d7e8e3f..f811964c75b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -149,6 +149,7 @@
 - PR #6632 Fix DataFrame initialization from list of dicts
 - PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest
 - PR #6787 Update java reduction APIs to reflect C++ changes
+- PR #6790 Fix result representation in groupby.apply
 - PR #6794 Fix AVRO reader issues with empty input
 - PR #6798 Fix `read_avro` docs
 - PR #6824 Fix JNI build
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 2a5aa599a06..6bbb3d0c1de 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -415,7 +415,23 @@ def mult(df):
         chunks = [
             grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])
         ]
-        result = cudf.concat([function(chk) for chk in chunks])
+        chunk_results = [function(chk) for chk in chunks]
+
+        if len(chunk_results) > 0 and cudf.utils.dtypes.is_scalar(
+            chunk_results[0]
+        ):
+            result = cudf.Series(
+                chunk_results, index=self.grouping.keys[offsets[:-1]]
+            )
+            result.index.names = self.grouping.names
+        elif len(chunk_results) > 0 and isinstance(
+            chunk_results[0], cudf.Series
+        ):
+            result = cudf.concat(chunk_results, axis=1).T
+            result.index.names = self.grouping.names
+        else:
+            result = cudf.concat(chunk_results)
+
         if self._sort:
             result = result.sort_index()
         return result
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index b1f0dba50f7..1e9f6305c4d 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1271,3 +1271,51 @@ def test_groupby_pipe():
     actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min())
 
     assert_eq(expected, actual)
+
+
+def test_groupby_apply_return_scalars():
+    pdf = pd.DataFrame(
+        {
+            "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
+            "B": [
+                0.01,
+                np.nan,
+                0.03,
+                0.04,
+                np.nan,
+                0.06,
+                0.07,
+                0.08,
+                0.09,
+                1.0,
+            ],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    def custom_map_func(x):
+        x = x[~x["B"].isna()]
+        ticker = x.shape[0]
+        full = ticker / 10
+        return full
+
+    expected = pdf.groupby("A").apply(lambda x: custom_map_func(x))
+    actual = gdf.groupby("A").apply(lambda x: custom_map_func(x))
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "cust_func",
+    [lambda x: x - x.max(), lambda x: x.min() - x.max(), lambda x: x.min()],
+)
+def test_groupby_apply_return_series_dataframe(cust_func):
+    pdf = pd.DataFrame(
+        {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    expected = pdf.groupby(["key"]).apply(cust_func)
+    actual = gdf.groupby(["key"]).apply(cust_func)
+
+    assert_eq(expected, actual)