diff --git a/CHANGELOG.md b/CHANGELOG.md index fd3a045c9b9..0a1d3b2b84f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -127,6 +127,7 @@ - PR #6837 Avoid gather when copying strings view from start of strings column - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp - PR #6807 Refactor `std::array` usage in row group index writing in ORC +- PR #6914 Enable groupby `list` aggregation for strings - PR #6908 Parquet option for strictly decimal reading ## Bug Fixes diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c4e1da8bfe6..6037c0ce0bb 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -51,6 +51,7 @@ _STRING_AGGS = { "min", "nunique", "nth", + "collect" } _LIST_AGGS = { diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 53166d4bb8c..5dc96c98678 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1268,6 +1268,26 @@ def test_groupby_list_single_element(list_agg): ) +@pytest.mark.parametrize( + "agg", [list, [list, "count"], {"b": list, "c": "sum"}] +) +def test_groupby_list_strings(agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": ["b", "a", None, "e", "d"], + "c": [1, 2, 3, 4, 5], + } + ) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), + check_dtype=False, + ) + + def test_groupby_list_columns_excluded(): pdf = pd.DataFrame( {