From bd321d1e9332695afff4db885eb75393477541bf Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 18:22:26 -0500
Subject: [PATCH] Enable groupby `list` aggregation for strings (#6914)

Authors:
  - Ashwin Srinath <shwina@users.noreply.github.com>
  - Ashwin Srinath <3190405+shwina@users.noreply.github.com>

Approvers:
  - Keith Kraus
  - Keith Kraus
  - Keith Kraus

URL: https://github.com/rapidsai/cudf/pull/6914
---
 CHANGELOG.md                           |  1 +
 python/cudf/cudf/_lib/groupby.pyx      |  1 +
 python/cudf/cudf/tests/test_groupby.py | 20 ++++++++++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 071d3e15522..8aca09a02f1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -128,6 +128,7 @@
 - PR #6837 Avoid gather when copying strings view from start of strings column
 - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
 - PR #6807 Refactor `std::array` usage in row group index writing in ORC
+- PR #6914 Enable groupby `list` aggregation for strings
 - PR #6908 Parquet option for strictly decimal reading
 
 ## Bug Fixes
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index c4e1da8bfe6..6037c0ce0bb 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -51,6 +51,7 @@ _STRING_AGGS = {
     "min",
     "nunique",
     "nth",
+    "collect"
 }
 
 _LIST_AGGS = {
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 53166d4bb8c..5dc96c98678 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1268,6 +1268,26 @@ def test_groupby_list_single_element(list_agg):
     )
 
 
+@pytest.mark.parametrize(
+    "agg", [list, [list, "count"], {"b": list, "c": "sum"}]  # fn, list, dict
+)
+def test_groupby_list_strings(agg):  # `list` agg on a string column
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 1, 2, 2],  # groupby key
+            "b": ["b", "a", None, "e", "d"],  # strings, one missing value
+            "c": [1, 2, 3, 4, 5],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    assert_eq(
+        pdf.groupby("a").agg(agg),  # pandas result
+        gdf.groupby("a").agg(agg),  # cudf result for the same agg
+        check_dtype=False,
+    )
+
+
 def test_groupby_list_columns_excluded():
     pdf = pd.DataFrame(
         {