Enable groupby list aggregation for strings (#6914)

Authors:
  - Ashwin Srinath <[email protected]>
  - Ashwin Srinath <[email protected]>

Approvers:
  - Keith Kraus
  - Keith Kraus
  - Keith Kraus

URL: #6914
rapidsai · Dec 4, 2020 · bd321d1 · bd321d1
1 parent 30bbb39
commit bd321d1
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -128,6 +128,7 @@
 - PR #6837 Avoid gather when copying strings view from start of strings column
 - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
 - PR #6807 Refactor `std::array` usage in row group index writing in ORC
+- PR #6914 Enable groupby `list` aggregation for strings
 - PR #6908 Parquet option for strictly decimal reading
 
 ## Bug Fixes

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
@@ -51,6 +51,7 @@ _STRING_AGGS = {
     "min",
     "nunique",
     "nth",
+    "collect"
 }
 
 _LIST_AGGS = {

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1268,6 +1268,26 @@ def test_groupby_list_single_element(list_agg):
     )
 
 
+@pytest.mark.parametrize(
+    "agg", [list, [list, "count"], {"b": list, "c": "sum"}]
+)
+def test_groupby_list_strings(agg):
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 1, 2, 2],
+            "b": ["b", "a", None, "e", "d"],
+            "c": [1, 2, 3, 4, 5],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    assert_eq(
+        pdf.groupby("a").agg(agg),
+        gdf.groupby("a").agg(agg),
+        check_dtype=False,
+    )
+
+
 def test_groupby_list_columns_excluded():
     pdf = pd.DataFrame(
         {
-Original file line number
+Diff line change
@@ Expand Up / @@ -51,6 +51,7 @@ _STRING_AGGS = { @@
         "min",
         "nunique",
         "nth",
+        "collect"
     }
     _LIST_AGGS = {
@@ Expand Down @@