Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix index of Groupby.apply results when it is performed on empty objects #13944

Merged
6 commits merged on Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from cudf._lib.sort import segmented_sort_by_key
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.types import is_list_like
from cudf.api.types import is_bool_dtype, is_list_like
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, arange, as_column
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1373,7 +1373,16 @@ def mult(df):
"""

if self.obj.empty:
return self.obj
res = self.obj.copy(deep=True)
res.index = self.grouping.keys
if function in {"sum", "product"}:
# For `sum` & `product`, boolean types
# will need to result in `int64` type.
for name, col in res._data.items():
if is_bool_dtype(col.dtype):
res._data[name] = col.astype("int")
return res

if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, group_keys, grouped_values = self._grouped()
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3342,6 +3342,35 @@ def test_group_by_pandas_sort_order(groups, sort):
)


@pytest.mark.parametrize(
    "dtype",
    ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"],
)
@pytest.mark.parametrize(
    "apply_op",
    ["sum", "min", "max", "idxmax"],
)
def test_group_by_empty_apply(request, dtype, apply_op):
    """Compare ``groupby(...).apply(<op name>)`` on empty data with pandas.

    An empty three-column cudf frame of the given ``dtype`` is converted to
    pandas; both are grouped by ``"a"``, column ``"c"`` is selected, and the
    result of applying ``apply_op`` must match in values, dtype and index
    type.
    """
    # The datetime/"sum" combination is marked as an expected failure; every
    # other combination gets a marker whose condition is False.
    expect_failure = dtype == "datetime64[ns]" and apply_op == "sum"
    xfail_marker = pytest.mark.xfail(
        condition=expect_failure,
        reason="sum isn't supported for datetime64[ns]",
    )
    request.applymarker(xfail_marker)

    empty_columns = {"a": [], "b": [], "c": []}
    gdf = cudf.DataFrame(empty_columns, dtype=dtype)
    pdf = gdf.to_pandas()

    # Build both grouped selections before applying anything.
    cudf_grouped = gdf.groupby("a")["c"]
    pandas_grouped = pdf.groupby("a")["c"]

    actual = cudf_grouped.apply(apply_op)
    expected = pandas_grouped.apply(apply_op)

    assert_eq(actual, expected, check_dtype=True, check_index_type=True)


def test_groupby_consecutive_operations():
df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
pdf = df.to_pandas()
Expand Down