Skip to content

Commit

Permalink
Fix index of Groupby.apply results when it is performed on empty …
Browse files Browse the repository at this point in the history
…objects (#13944)

closes #13939 
This PR fixes two issues with `Groupby.apply` on empty objects: the index of the result was not being set correctly, and a corner case for the `bool` dtype was not handled for the `sum` and `product` operations.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #13944
  • Loading branch information
galipremsagar authored Aug 23, 2023
1 parent 4014ea3 commit 171fc91
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 2 deletions.
13 changes: 11 additions & 2 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from cudf._lib.sort import segmented_sort_by_key
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.types import is_list_like
from cudf.api.types import is_bool_dtype, is_list_like
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, arange, as_column
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1373,7 +1373,16 @@ def mult(df):
"""

if self.obj.empty:
return self.obj
res = self.obj.copy(deep=True)
res.index = self.grouping.keys
if function in {"sum", "product"}:
# For `sum` & `product`, boolean types
# will need to result in `int64` type.
for name, col in res._data.items():
if is_bool_dtype(col.dtype):
res._data[name] = col.astype("int")
return res

if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, group_keys, grouped_values = self._grouped()
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3342,6 +3342,35 @@ def test_group_by_pandas_sort_order(groups, sort):
)


@pytest.mark.parametrize(
    "dtype",
    ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"],
)
@pytest.mark.parametrize(
    "apply_op",
    ["sum", "min", "max", "idxmax"],
)
def test_group_by_empty_apply(request, dtype, apply_op):
    """Compare ``groupby(...).apply(op)`` on an empty frame against pandas.

    An empty three-column cuDF frame of the parametrized ``dtype`` is built,
    converted to pandas, and column ``"c"`` is grouped by ``"a"`` in both
    libraries. The results of applying ``apply_op`` must agree, including the
    value dtype and the index type.
    """
    # The datetime/"sum" combination is marked as an expected failure at
    # run time; every other parametrization must pass.
    is_datetime_sum = dtype == "datetime64[ns]" and apply_op == "sum"
    xfail_marker = pytest.mark.xfail(
        condition=is_datetime_sum,
        reason="sum isn't supported for datetime64[ns]",
    )
    request.applymarker(xfail_marker)

    empty_gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype)
    empty_pdf = empty_gdf.to_pandas()

    cudf_grouped = empty_gdf.groupby("a")["c"]
    pandas_grouped = empty_pdf.groupby("a")["c"]

    # cuDF result is computed first, then the pandas reference.
    actual = cudf_grouped.apply(apply_op)
    expected = pandas_grouped.apply(apply_op)

    assert_eq(
        actual,
        expected,
        check_dtype=True,
        check_index_type=True,
    )


def test_groupby_consecutive_operations():
df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
pdf = df.to_pandas()
Expand Down

0 comments on commit 171fc91

Please sign in to comment.