From e599ca40b17b288ffe52181cc498ce5e6c7866df Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 4 Feb 2022 11:12:49 -0800 Subject: [PATCH 1/5] Allow for string aggs in dataframe groupby --- python/dask_cudf/dask_cudf/groupby.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 1bc270a5b9f..7a5c953d7e8 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -82,6 +82,7 @@ def collect(self, split_every=None, split_out=1): def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() + arg = _redirect_aggs(arg) if ( @@ -178,10 +179,8 @@ def collect(self, split_every=None, split_out=1): def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() - arg = _redirect_aggs(arg) - if not isinstance(arg, dict): - arg = {self._slice: arg} + arg = _redirect_aggs(arg) if ( isinstance(self.obj, DaskDataFrame) @@ -253,7 +252,7 @@ def groupby_agg( if isinstance(gb_cols, str): gb_cols = [gb_cols] columns = [c for c in ddf.columns if c not in gb_cols] - if isinstance(aggs, list): + if not isinstance(aggs, dict): aggs = {col: aggs for col in columns} # Assert if our output will have a MultiIndex; this will be the case if @@ -413,6 +412,8 @@ def _is_supported(arg, supported: set): _global_set = set(arg) return bool(_global_set.issubset(supported)) + elif isinstance(arg, str): + return arg in supported return False From aa0f276a4f4b6da4842b313303af7ab17f416a51 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 4 Feb 2022 11:13:09 -0800 Subject: [PATCH 2/5] Expand test for _is_supported --- .../dask_cudf/dask_cudf/tests/test_groupby.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 274c6670426..b815cba95e5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -556,11 +556,20 @@ def test_groupby_agg_redirect(aggregations): @pytest.mark.parametrize( - "arg", - [["not_supported"], {"a": "not_supported"}, {"a": ["not_supported"]}], + "arg,supported", + [ + ("sum", True), + (["sum"], True), + ({"a": "sum"}, True), + ({"a": ["sum"]}, True), + ("not_supported", False), + (["not_supported"], False), + ({"a": "not_supported"}, False), + ({"a": ["not_supported"]}, False), + ], ) -def test_is_supported(arg): - assert _is_supported(arg, {"supported"}) is False +def test_is_supported(arg, supported): + assert _is_supported(arg, SUPPORTED_AGGS) is supported def test_groupby_unique_lists(): From 7d3049b79c5fb3f52caa0bb32ff7a2d4ce9c3827 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 17 Mar 2022 06:51:18 -0700 Subject: [PATCH 3/5] Resolve copyright issue --- python/dask_cudf/dask_cudf/tests/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index b815cba95e5..ada6a7244fb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import numpy as np import pandas as pd From e6f98aa9a71ddf7263989d9abac5f0cae4264f22 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 12 May 2022 08:34:47 -0700 Subject: [PATCH 4/5] Add back in series-specific handling of agg normalization --- python/dask_cudf/dask_cudf/groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 681db632986..22705c2b83b 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -435,6 +435,9 @@ def aggregate(self, arg, split_every=None, split_out=1): arg = _redirect_aggs(arg) + if not isinstance(arg, dict): + arg = {self._slice: arg} + if _groupby_supported(self) and _aggs_supported(arg, SUPPORTED_AGGS): return groupby_agg( self.obj, From a54ee42d0527c115b4432c309abc3ec36daa15e5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 12 May 2022 08:35:46 -0700 Subject: [PATCH 5/5] Add in .agg tests for string aggregations --- python/dask_cudf/dask_cudf/tests/test_groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 7c403d22df5..2b7f2bdae36 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -58,7 +58,10 @@ def test_groupby_basic(series, aggregation): "func", [ lambda df: df.groupby("x").agg({"y": "max"}), + lambda df: df.groupby("x").agg(["sum", "max"]), lambda df: df.groupby("x").y.agg(["sum", "max"]), + lambda df: df.groupby("x").agg("sum"), + lambda df: df.groupby("x").y.agg("sum"), ], ) def test_groupby_agg(func):