From 48a94d9bd8947a165a21b04a8ebcbeab0c3a32f3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 9 Nov 2020 16:38:44 -0500 Subject: [PATCH 1/7] Add DecimalDtype.itemsize --- python/cudf/cudf/core/dtypes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 044ff16c405..531e1812c2f 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -241,3 +241,7 @@ def to_arrow(self): @classmethod def from_arrow(cls, typ): return cls(typ.precision, typ.scale) + + @property + def itemsize(self): + return 8 From 3264366a3115f03f48f5944f8b118eff2c362819 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 9 Nov 2020 16:40:06 -0500 Subject: [PATCH 2/7] Add DecimalColumn --- python/cudf/cudf/core/column/decimal.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 python/cudf/cudf/core/column/decimal.py diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py new file mode 100644 index 00000000000..d9659bc05f0 --- /dev/null +++ b/python/cudf/cudf/core/column/decimal.py @@ -0,0 +1,5 @@ +from cudf.core.column import ColumnBase + + +class DecimalColumn(ColumnBase): + pass From 9c2741eca88bf3786207ad386103aebaa08c229b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 4 Dec 2020 13:49:42 -0500 Subject: [PATCH 3/7] Enable groupby list for strings --- python/cudf/cudf/_lib/groupby.pyx | 1 + python/cudf/cudf/tests/test_groupby.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c4e1da8bfe6..6037c0ce0bb 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -51,6 +51,7 @@ _STRING_AGGS = { "min", "nunique", "nth", + "collect" } _LIST_AGGS = { diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 53166d4bb8c..975b742614e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1268,6 +1268,18 @@ def test_groupby_list_single_element(list_agg): ) +@pytest.mark.parametrize("list_agg", [list, "collect"]) +def test_groupby_list_strings(list_agg): + pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": ["b", "a", "c", "e", "d"]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.groupby("a").agg({"b": list}), + gdf.groupby("a").agg({"b": list_agg}), + check_dtype=False, + ) + + def test_groupby_list_columns_excluded(): pdf = pd.DataFrame( { From a2b25a087407e7b2fbf72630f92a63b8876e1626 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 4 Dec 2020 13:51:23 -0500 Subject: [PATCH 4/7] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f63b2bee24..dfe0945e9cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -119,6 +119,7 @@ - PR #6837 Avoid gather when copying strings view from start of strings column - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp - PR #6807 Refactor `std::array` usage in row group index writing in ORC +- PR #6914 Enable groupby `list` aggregation for strings ## Bug Fixes From d1ae993c709869464a7e2e890b7423aa2cd06930 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 4 Dec 2020 13:54:48 -0500 Subject: [PATCH 5/7] Revert "Add DecimalDtype.itemsize" This reverts commit 48a94d9bd8947a165a21b04a8ebcbeab0c3a32f3. --- python/cudf/cudf/core/dtypes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 531e1812c2f..044ff16c405 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -241,7 +241,3 @@ def to_arrow(self): @classmethod def from_arrow(cls, typ): return cls(typ.precision, typ.scale) - - @property - def itemsize(self): - return 8 From e19c42feb984ac3af0ba772f1996f079321fc4d2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 4 Dec 2020 13:55:05 -0500 Subject: [PATCH 6/7] Revert "Add DecimalColumn" This reverts commit 3264366a3115f03f48f5944f8b118eff2c362819. --- python/cudf/cudf/core/column/decimal.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 python/cudf/cudf/core/column/decimal.py diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py deleted file mode 100644 index d9659bc05f0..00000000000 --- a/python/cudf/cudf/core/column/decimal.py +++ /dev/null @@ -1,5 +0,0 @@ -from cudf.core.column import ColumnBase - - -class DecimalColumn(ColumnBase): - pass From 57cca09799343ba440b61b4833acaaa8dd34e53f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 4 Dec 2020 14:05:10 -0500 Subject: [PATCH 7/7] More tests --- python/cudf/cudf/tests/test_groupby.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 975b742614e..5dc96c98678 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1268,14 +1268,22 @@ def test_groupby_list_single_element(list_agg): ) -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_strings(list_agg): - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": ["b", "a", "c", "e", "d"]}) +@pytest.mark.parametrize( + "agg", [list, [list, "count"], {"b": list, "c": "sum"}] +) +def test_groupby_list_strings(agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": ["b", "a", None, "e", "d"], + "c": [1, 2, 3, 4, 5], + } + ) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), check_dtype=False, )