From 48a94d9bd8947a165a21b04a8ebcbeab0c3a32f3 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 9 Nov 2020 16:38:44 -0500
Subject: [PATCH 1/7] Add DecimalDtype.itemsize

---
 python/cudf/cudf/core/dtypes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 044ff16c405..531e1812c2f 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -241,3 +241,7 @@ def to_arrow(self):
     @classmethod
     def from_arrow(cls, typ):
         return cls(typ.precision, typ.scale)
+
+    @property
+    def itemsize(self):
+        return 8

From 3264366a3115f03f48f5944f8b118eff2c362819 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 9 Nov 2020 16:40:06 -0500
Subject: [PATCH 2/7] Add DecimalColumn

---
 python/cudf/cudf/core/column/decimal.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 python/cudf/cudf/core/column/decimal.py

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
new file mode 100644
index 00000000000..d9659bc05f0
--- /dev/null
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -0,0 +1,5 @@
+from cudf.core.column import ColumnBase
+
+
+class DecimalColumn(ColumnBase):
+    pass

From 9c2741eca88bf3786207ad386103aebaa08c229b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 13:49:42 -0500
Subject: [PATCH 3/7] Enable groupby list for strings

---
 python/cudf/cudf/_lib/groupby.pyx      |  1 +
 python/cudf/cudf/tests/test_groupby.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index c4e1da8bfe6..6037c0ce0bb 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -51,6 +51,7 @@ _STRING_AGGS = {
     "min",
     "nunique",
     "nth",
+    "collect"
 }
 
 _LIST_AGGS = {
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 53166d4bb8c..975b742614e 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1268,6 +1268,18 @@ def test_groupby_list_single_element(list_agg):
     )
 
 
+@pytest.mark.parametrize("list_agg", [list, "collect"])
+def test_groupby_list_strings(list_agg):
+    pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": ["b", "a", "c", "e", "d"]})
+    gdf = cudf.from_pandas(pdf)
+
+    assert_eq(
+        pdf.groupby("a").agg({"b": list}),
+        gdf.groupby("a").agg({"b": list_agg}),
+        check_dtype=False,
+    )
+
+
 def test_groupby_list_columns_excluded():
     pdf = pd.DataFrame(
         {

From a2b25a087407e7b2fbf72630f92a63b8876e1626 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 13:51:23 -0500
Subject: [PATCH 4/7] Add changelog entry for PR #6914

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f63b2bee24..dfe0945e9cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -119,6 +119,7 @@
 - PR #6837 Avoid gather when copying strings view from start of strings column
 - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
 - PR #6807 Refactor `std::array` usage in row group index writing in ORC
+- PR #6914 Enable groupby `list` aggregation for strings
 
 ## Bug Fixes
 

From d1ae993c709869464a7e2e890b7423aa2cd06930 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 13:54:48 -0500
Subject: [PATCH 5/7] Revert "Add DecimalDtype.itemsize"

This reverts commit 48a94d9bd8947a165a21b04a8ebcbeab0c3a32f3.
---
 python/cudf/cudf/core/dtypes.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 531e1812c2f..044ff16c405 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -241,7 +241,3 @@ def to_arrow(self):
     @classmethod
     def from_arrow(cls, typ):
         return cls(typ.precision, typ.scale)
-
-    @property
-    def itemsize(self):
-        return 8

From e19c42feb984ac3af0ba772f1996f079321fc4d2 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 13:55:05 -0500
Subject: [PATCH 6/7] Revert "Add DecimalColumn"

This reverts commit 3264366a3115f03f48f5944f8b118eff2c362819.
---
 python/cudf/cudf/core/column/decimal.py | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 python/cudf/cudf/core/column/decimal.py

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
deleted file mode 100644
index d9659bc05f0..00000000000
--- a/python/cudf/cudf/core/column/decimal.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from cudf.core.column import ColumnBase
-
-
-class DecimalColumn(ColumnBase):
-    pass

From 57cca09799343ba440b61b4833acaaa8dd34e53f Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 4 Dec 2020 14:05:10 -0500
Subject: [PATCH 7/7] Extend groupby list tests for strings (nulls, multiple aggs)

---
 python/cudf/cudf/tests/test_groupby.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 975b742614e..5dc96c98678 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1268,14 +1268,22 @@ def test_groupby_list_single_element(list_agg):
     )
 
 
-@pytest.mark.parametrize("list_agg", [list, "collect"])
-def test_groupby_list_strings(list_agg):
-    pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": ["b", "a", "c", "e", "d"]})
+@pytest.mark.parametrize(
+    "agg", [list, [list, "count"], {"b": list, "c": "sum"}]
+)
+def test_groupby_list_strings(agg):
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 1, 2, 2],
+            "b": ["b", "a", None, "e", "d"],
+            "c": [1, 2, 3, 4, 5],
+        }
+    )
     gdf = cudf.from_pandas(pdf)
 
     assert_eq(
-        pdf.groupby("a").agg({"b": list}),
-        gdf.groupby("a").agg({"b": list_agg}),
+        pdf.groupby("a").agg(agg),
+        gdf.groupby("a").agg(agg),
         check_dtype=False,
     )