Skip to content

Commit

Permalink
Allow numeric_only=True for simple groupby reductions (#15326)
Browse files Browse the repository at this point in the history
Adds some simple logic to handle the case that `DataFrameGroupBy._reduce(numeric_only=True)` is called.

## Further Background
This change is needed for some dask_cudf groupby aggregations (e.g. "mean") to work with the latest `dask/dask-expr:main`. Although other workarounds and "fixes" are possible, the easiest solution is probably something like this PR.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15326
  • Loading branch information
rjzamora authored Mar 18, 2024
1 parent 62a40cb commit f091949
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 7 deletions.
29 changes: 24 additions & 5 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
from cudf.api.types import (
is_bool_dtype,
is_float_dtype,
is_list_like,
is_numeric_dtype,
)
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
Expand Down Expand Up @@ -701,6 +706,11 @@ def agg(self, func):

return result

def _reduce_numeric_only(self, op: str):
raise NotImplementedError(
f"numeric_only is not implemented for {type(self)}"
)

def _reduce(
self,
op: str,
Expand Down Expand Up @@ -731,14 +741,12 @@ def _reduce(
The numeric_only, min_count
"""
if numeric_only:
raise NotImplementedError(
"numeric_only parameter is not implemented yet"
)
if min_count != 0:
raise NotImplementedError(
"min_count parameter is not implemented yet"
)
if numeric_only:
return self._reduce_numeric_only(op)
return self.agg(op)

def _scan(self, op: str, *args, **kwargs):
Expand Down Expand Up @@ -2648,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

_PROTECTED_KEYS = frozenset(("obj",))

def _reduce_numeric_only(self, op: str):
columns = list(
name
for name in self.obj._data.names
if (
is_numeric_dtype(self.obj._data[name].dtype)
and name not in self.grouping.names
)
)
return self[columns].agg(op)

def __getitem__(self, key):
return self.obj[key].groupby(
by=self.grouping.keys,
Expand Down
30 changes: 28 additions & 2 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,7 +1259,7 @@ def test_groupby_unsupported_columns():
pdg = pdf.groupby("x").sum(numeric_only=True)
# cudf does not yet support numeric_only, so our default is False (unlike
# pandas, which defaults to inferring and throws a warning about it).
gdg = gdf.groupby("x").sum()
gdg = gdf.groupby("x").sum(numeric_only=True)
assert_groupby_results_equal(pdg, gdg)


Expand Down Expand Up @@ -2158,7 +2158,9 @@ def test_groupby_list_columns_excluded():
pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True)

assert_groupby_results_equal(
pandas_result, gdf.groupby("a").mean(), check_dtype=False
pandas_result,
gdf.groupby("a").mean(numeric_only=True),
check_dtype=False,
)

assert_groupby_results_equal(
Expand Down Expand Up @@ -3826,3 +3828,27 @@ def test_groupby_shift_series_multiindex():
result = ser.groupby(level=0).shift(1)
expected = ser.to_pandas().groupby(level=0).shift(1)
assert_eq(expected, result)


@pytest.mark.parametrize(
"func", ["min", "max", "sum", "mean", "idxmin", "idxmax"]
)
@pytest.mark.parametrize(
"by,data",
[
("a", {"a": [1, 2, 3]}),
(["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}),
("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}),
("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}),
(["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}),
("b", {"b": ["A", "B", "C"]}),
],
)
def test_group_by_reduce_numeric_only(by, data, func):
# Test that simple groupby reductions support numeric_only=True
df = cudf.DataFrame(data)
expected = getattr(df.to_pandas().groupby(by, sort=True), func)(
numeric_only=True
)
result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
assert_eq(expected, result)

0 comments on commit f091949

Please sign in to comment.