From bd321d1e9332695afff4db885eb75393477541bf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 4 Dec 2020 18:22:26 -0500 Subject: [PATCH] Enable groupby `list` aggregation for strings(#6914) Authors: - Ashwin Srinath - Ashwin Srinath <3190405+shwina@users.noreply.github.com> Approvers: - Keith Kraus - Keith Kraus - Keith Kraus URL: https://github.com/rapidsai/cudf/pull/6914 --- CHANGELOG.md | 1 + python/cudf/cudf/_lib/groupby.pyx | 1 + python/cudf/cudf/tests/test_groupby.py | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 071d3e15522..8aca09a02f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -128,6 +128,7 @@ - PR #6837 Avoid gather when copying strings view from start of strings column - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp - PR #6807 Refactor `std::array` usage in row group index writing in ORC +- PR #6914 Enable groupby `list` aggregation for strings - PR #6908 Parquet option for strictly decimal reading ## Bug Fixes diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c4e1da8bfe6..6037c0ce0bb 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -51,6 +51,7 @@ _STRING_AGGS = { "min", "nunique", "nth", + "collect" } _LIST_AGGS = { diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 53166d4bb8c..5dc96c98678 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1268,6 +1268,26 @@ def test_groupby_list_single_element(list_agg): ) +@pytest.mark.parametrize( + "agg", [list, [list, "count"], {"b": list, "c": "sum"}] +) +def test_groupby_list_strings(agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": ["b", "a", None, "e", "d"], + "c": [1, 2, 3, 4, 5], + } + ) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), + check_dtype=False, + ) + + def test_groupby_list_columns_excluded(): pdf = pd.DataFrame( {