Fix GroupBy.get_group and GroupBy.indices (#15143)

These are supposed to index based on row indices, not row labels.

- Closes #14955

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #15143
rapidsai · Mar 5, 2024 · d4368e9 · d4368e9
1 parent 1f5fcf6
commit d4368e9
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 7 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -363,13 +363,22 @@ def indices(self):
         >>> df.groupby(by=["a"]).indices
         {10: array([0, 1]), 40: array([2])}
         """
-        group_names, offsets, _, grouped_values = self._grouped()
+        offsets, group_keys, (indices,) = self._groupby.groups(
+            [
+                cudf.core.column.as_column(
+                    range(len(self.obj)), dtype=size_type_dtype
+                )
+            ]
+        )
 
+        group_keys = libcudf.stream_compaction.drop_duplicates(group_keys)
+        if len(group_keys) > 1:
+            index = cudf.MultiIndex.from_arrays(group_keys)
+        else:
+            (group_keys,) = group_keys
+            index = cudf.Index(group_keys)
         return dict(
-            zip(
-                group_names.to_pandas(),
-                np.split(grouped_values.index.values, offsets[1:-1]),
-            )
+            zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
 
     @_cudf_nvtx_annotate
@@ -414,8 +423,7 @@ def get_group(self, name, obj=None):
                 "instead of ``gb.get_group(name, obj=df)``.",
                 FutureWarning,
             )
-
-        return obj.loc[self.groups[name].drop_duplicates()]
+        return obj.iloc[self.indices[name]]
 
     @_cudf_nvtx_annotate
     def size(self):

diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from numpy.testing import assert_array_equal
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+def test_groupby_14955():
+    # https://github.com/rapidsai/cudf/issues/14955
+    df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4)
+    agg = df.groupby("a")
+    pagg = df.to_pandas().groupby("a")
+    for key in agg.groups:
+        assert_array_equal(pagg.indices[key], agg.indices[key].get())
+        assert_eq(pagg.get_group(key), agg.get_group(key))