From f0919494ad874dd23cb63630272165c41f9ea144 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Mar 2024 12:59:30 -0500 Subject: [PATCH] Allow ``numeric_only=True`` for simple groupby reductions (#15326) Adds some simple logic to handle the case that `DataFrameGroupBy._reduce(numeric_only=True)` is called. ## Further Background This change is needed for some dask_cudf groupby aggregations (e.g. "mean") to work with the latest `dask/dask-expr:main`. Although other workarounds and "fixes" are possible, the easiest solution is probably something like this PR. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15326 --- python/cudf/cudf/core/groupby/groupby.py | 29 +++++++++++++++++++---- python/cudf/cudf/tests/test_groupby.py | 30 ++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d995964057b..945e546af1a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,12 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like +from cudf.api.types import ( + is_bool_dtype, + is_float_dtype, + is_list_like, + is_numeric_dtype, +) from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -701,6 +706,11 @@ def agg(self, func): return result + def _reduce_numeric_only(self, op: str): + raise NotImplementedError( + f"numeric_only is not implemented for {type(self)}" + ) + def _reduce( self, op: str, @@ -731,14 +741,12 @@ def _reduce( The numeric_only, min_count """ - if numeric_only: - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) if min_count != 0: raise NotImplementedError( "min_count parameter is not implemented yet" ) + if numeric_only: + return self._reduce_numeric_only(op) return self.agg(op) def _scan(self, op: str, *args, **kwargs): @@ -2648,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) + def _reduce_numeric_only(self, op: str): + columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + def __getitem__(self, key): return self.obj[key].groupby( by=self.grouping.keys, diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 06516b6b4ea..c139b06d20f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1259,7 +1259,7 @@ def test_groupby_unsupported_columns(): pdg = pdf.groupby("x").sum(numeric_only=True) # cudf does not yet support numeric_only, so our default is False (unlike # pandas, which defaults to inferring and throws a warning about it). - gdg = gdf.groupby("x").sum() + gdg = gdf.groupby("x").sum(numeric_only=True) assert_groupby_results_equal(pdg, gdg) @@ -2158,7 +2158,9 @@ def test_groupby_list_columns_excluded(): pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) assert_groupby_results_equal( - pandas_result, gdf.groupby("a").mean(), check_dtype=False + pandas_result, + gdf.groupby("a").mean(numeric_only=True), + check_dtype=False, ) assert_groupby_results_equal( @@ -3826,3 +3828,27 @@ def test_groupby_shift_series_multiindex(): result = ser.groupby(level=0).shift(1) expected = ser.to_pandas().groupby(level=0).shift(1) assert_eq(expected, result) + + +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "idxmin", "idxmax"] +) +@pytest.mark.parametrize( + "by,data", + [ + ("a", {"a": [1, 2, 3]}), + (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}), + ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}), + ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}), + (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}), + ("b", {"b": ["A", "B", "C"]}), + ], +) +def test_group_by_reduce_numeric_only(by, data, func): + # Test that simple groupby reductions support numeric_only=True + df = cudf.DataFrame(data) + expected = getattr(df.to_pandas().groupby(by, sort=True), func)( + numeric_only=True + ) + result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) + assert_eq(expected, result)