Skip to content

Commit

Permalink
Fix GroupBy.get_group and GroupBy.indices (#15143)
Browse files Browse the repository at this point in the history
`GroupBy.get_group` and `GroupBy.indices` are supposed to index by row position, not by row label.

- Closes #14955

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #15143
  • Loading branch information
wence- authored Mar 5, 2024
1 parent 1f5fcf6 commit d4368e9
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
22 changes: 15 additions & 7 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,13 +363,22 @@ def indices(self):
>>> df.groupby(by=["a"]).indices
{10: array([0, 1]), 40: array([2])}
"""
group_names, offsets, _, grouped_values = self._grouped()
offsets, group_keys, (indices,) = self._groupby.groups(
[
cudf.core.column.as_column(
range(len(self.obj)), dtype=size_type_dtype
)
]
)

group_keys = libcudf.stream_compaction.drop_duplicates(group_keys)
if len(group_keys) > 1:
index = cudf.MultiIndex.from_arrays(group_keys)
else:
(group_keys,) = group_keys
index = cudf.Index(group_keys)
return dict(
zip(
group_names.to_pandas(),
np.split(grouped_values.index.values, offsets[1:-1]),
)
zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -414,8 +423,7 @@ def get_group(self, name, obj=None):
"instead of ``gb.get_group(name, obj=df)``.",
FutureWarning,
)

return obj.loc[self.groups[name].drop_duplicates()]
return obj.iloc[self.indices[name]]

@_cudf_nvtx_annotate
def size(self):
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/groupby/test_groupby_obj.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from numpy.testing import assert_array_equal

import cudf
from cudf.testing._utils import assert_eq


def test_groupby_14955():
    """Regression test: group ``indices`` and ``get_group`` match pandas.

    See https://github.com/rapidsai/cudf/issues/14955
    """
    # Every row carries the same index label (0), so the rows of a group
    # cannot be told apart by label.
    gdf = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4)
    got = gdf.groupby("a")
    expect = gdf.to_pandas().groupby("a")
    for key in got.groups:
        assert_array_equal(expect.indices[key], got.indices[key].get())
        assert_eq(expect.get_group(key), got.get_group(key))

0 comments on commit d4368e9

Please sign in to comment.