diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b0be97915f2..e97ea8081e8 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -21,7 +21,7 @@
 from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
-from cudf.api.types import is_list_like
+from cudf.api.types import is_bool_dtype, is_list_like
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, arange, as_column
 from cudf.core.column_accessor import ColumnAccessor
@@ -1373,7 +1373,18 @@ def mult(df):
         """
 
         if self.obj.empty:
-            return self.obj
+            # Nothing to group: return an empty deep copy indexed by the
+            # grouping keys instead of handing back the original object.
+            res = self.obj.copy(deep=True)
+            res.index = self.grouping.keys
+            if function in {"sum", "product"}:
+                # For `sum` & `product`, boolean types
+                # will need to result in `int64` type.
+                for name, col in res._data.items():
+                    if is_bool_dtype(col.dtype):
+                        res._data[name] = col.astype("int64")
+            return res
+
         if not callable(function):
             raise TypeError(f"type {type(function)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped()
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index a3b205cc16b..0e96b97e1e1 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3342,6 +3342,37 @@ def test_group_by_pandas_sort_order(groups, sort):
         )
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"],
+)
+@pytest.mark.parametrize(
+    "apply_op",
+    ["sum", "min", "max", "idxmax"],
+)
+def test_group_by_empty_apply(request, dtype, apply_op):
+    # `apply` with a string op name on an empty groupby must give the
+    # same values, dtype and index type as the pandas equivalent.
+    request.applymarker(
+        pytest.mark.xfail(
+            condition=(dtype == "datetime64[ns]" and apply_op == "sum"),
+            reason=("sum isn't supported for datetime64[ns]"),
+        )
+    )
+    gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype)
+    pdf = gdf.to_pandas()
+
+    gg = gdf.groupby("a")["c"]
+    pg = pdf.groupby("a")["c"]
+
+    assert_eq(
+        gg.apply(apply_op),
+        pg.apply(apply_op),
+        check_dtype=True,
+        check_index_type=True,
+    )
+
+
 def test_groupby_consecutive_operations():
     df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
     pdf = df.to_pandas()