rapidsai · rapids-bot · Feb 8, 2024 · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024
@@ -37,6 +37,7 @@ cdef class GroupByRequest:
 
 cdef class GroupBy:
     cdef unique_ptr[groupby] c_obj
+    cdef Table _keys
     cpdef tuple aggregate(self, list requests)
     cpdef tuple scan(self, list requests)
     cpdef tuple shift(self, Table values, list offset, list fill_values)

@@ -98,6 +98,9 @@ cdef class GroupBy:
         sorted keys_are_sorted=sorted.NO
     ):
         self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted))
+        # keep a reference to the keys table so it doesn't get
+        # deallocated from under us:
+        self._keys = keys
 
     @staticmethod
     cdef tuple _parse_outputs(
@@ -253,26 +256,31 @@ cdef class GroupBy:
         Parameters
         ----------
         values : Table, optional
-            The columns to get group labels for. If not specified, the group
-            labels for the group keys are returned.
+            The columns to get group labels for. If not specified,
+            `None` is returned for the group values.
 
         Returns
         -------
-        Tuple[Table, Table, List[int]]
+        Tuple[Table, Optional[Table], List[int]]
             A tuple of tables containing three items:
                 - A table of group keys
-                - A table of group values
+                - A table of group values or None
                 - A list of integer offsets into the tables
         """
 
         cdef groups c_groups
         if values:
             c_groups = dereference(self.c_obj).get_groups(values.view())
+            return (
+                Table.from_libcudf(move(c_groups.keys)),
+                Table.from_libcudf(move(c_groups.values)),
+                c_groups.offsets,
+            )
         else:
+            # without values, c_groups.values is nullptr; return None for it
             c_groups = dereference(self.c_obj).get_groups()
-
-        return (
-            Table.from_libcudf(move(c_groups.keys)),
-            Table.from_libcudf(move(c_groups.values)),
-            c_groups.offsets,
-        )
+            return (
+                Table.from_libcudf(move(c_groups.keys)),
+                None,
+                c_groups.offsets,
+            )
@@ -3756,3 +3756,11 @@ def test_group_by_value_counts_with_count_column():
     df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]})
     with pytest.raises(ValueError):
         df.groupby("a", as_index=False).value_counts()
+
+
+def test_groupby_internal_groups_empty(gdf):
+    # Regression test: the internal .groups() method must not segfault
+    # when given an empty list; it returns None for the group values.
+    gb = gdf.groupby("y")._groupby
+    _, grouped_vals, _ = gb.groups([])
+    assert grouped_vals is None