Commit adb7a0b: Minor fixes in review
wence- committed Mar 22, 2023
1 parent b57b068 commit adb7a0b
Showing 4 changed files with 15 additions and 20 deletions.
5 changes: 1 addition & 4 deletions python/cudf/benchmarks/API/bench_dataframe.py
@@ -125,10 +125,7 @@ def bench_groupby_sample(
     ).astype(int)
     kwargs = {"n": target_size, "replace": replace}
 
-    def _():
-        return grouper.sample(**kwargs)
-
-    benchmark(_)
+    benchmark(grouper.sample, **kwargs)
 
 
 @benchmark_with_object(cls="dataframe", dtype="int")
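Note on the benchmark change above: pytest-benchmark's benchmark fixture accepts a callable plus its arguments and times fn(*args, **kwargs) directly, so the throwaway closure only added an extra frame. A minimal sketch of the convention, with a hypothetical toy frame standing in for the fixture data:

    import cudf

    def test_groupby_sample(benchmark):
        df = cudf.DataFrame({"g": [0, 0, 1, 1], "v": [1, 2, 3, 4]})
        grouper = df.groupby("g")
        kwargs = {"n": 1, "replace": True}
        # Equivalent to the deleted `def _(): return grouper.sample(**kwargs)`
        benchmark(grouper.sample, **kwargs)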
5 changes: 1 addition & 4 deletions python/cudf/cudf/_lib/sort.pyx
@@ -186,10 +186,7 @@ def segmented_sort_by_key(
     column_order = column_order or [True] * ncol
     null_precedence = null_precedence or ["first"] * ncol
     for asc, null in zip(column_order, null_precedence):
-        if asc:
-            c_column_order.push_back(order.ASCENDING)
-        else:
-            c_column_order.push_back(order.DESCENDING)
+        c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING)
         if asc ^ (null == "first"):
             c_null_precedence.push_back(null_order.AFTER)
         elif asc ^ (null == "last"):
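Aside on the XOR in the retained lines: libcudf's null_order says whether nulls compare before or after all other values, so a descending column must flip the user-facing "first"/"last". A plain-Python sketch of the mapping (the invalid-value error here is an assumption; the original's else branch is truncated above):

    def null_precedence(asc: bool, null: str) -> str:
        if null not in {"first", "last"}:
            raise ValueError(f"Invalid null precedence {null}")
        # asc ^ (null == "first") is True exactly when the user's request
        # must map to null_order.AFTER in libcudf terms.
        return "AFTER" if asc ^ (null == "first") else "BEFORE"

    assert null_precedence(True, "first") == "BEFORE"   # ascending, nulls first
    assert null_precedence(True, "last") == "AFTER"     # ascending, nulls last
    assert null_precedence(False, "first") == "AFTER"   # descending, nulls first
    assert null_precedence(False, "last") == "BEFORE"   # descending, nulls last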
20 changes: 10 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -17,6 +17,7 @@
 from cudf._lib.null_mask import bitmask_or
 from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
+from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.types import is_list_like
 from cudf.core.abc import Serializable
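Context for the new import: size_type_dtype is cudf's Python-side alias for libcudf's size_type, the dtype of row offsets and gather indices. The rest of this commit swaps it in for hard-coded np.int32 so index buffers track libcudf if that type ever changes. A quick check, assuming a current build where size_type is 32-bit:

    import numpy as np
    from cudf._lib.types import size_type_dtype

    assert size_type_dtype == np.dtype("int32")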
@@ -638,7 +639,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
         # aggregation scheme in libcudf. This is probably "fast
         # enough" for most reasonable input sizes.
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         # "Out of bounds" n for the group size either means no entries
         # (negative) or all the entries (positive)
@@ -652,7 +653,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
             group_offsets = group_offsets[:-1]
         else:
             group_offsets = group_offsets[1:] - size_per_group
-        to_take = np.arange(size_per_group.sum(), dtype=np.int32)
+        to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype)
         fixup = np.empty_like(size_per_group)
         fixup[0] = 0
         np.cumsum(size_per_group[:-1], out=fixup[1:])
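The two hunks above only change a dtype, but the surrounding index arithmetic is worth spelling out: a flat arange over the total take count is shifted per group so each group's slice starts at its own offset. A NumPy-only sketch with illustrative toy offsets:

    import numpy as np

    offsets = np.array([0, 3, 5, 9], dtype=np.int32)  # toy group boundaries
    n = 2                                             # head(n)
    size_per_group = np.minimum(np.diff(offsets), n)  # clip n to group sizes
    to_take = np.arange(size_per_group.sum(), dtype=np.int32)
    fixup = np.empty_like(size_per_group)
    fixup[0] = 0
    np.cumsum(size_per_group[:-1], out=fixup[1:])
    # Shift each group's slice from its position in the flat arange to the
    # group's start row.
    to_take += np.repeat(offsets[:-1] - fixup, size_per_group)
    print(to_take)  # [0 1 3 4 5 6]: first two rows of each group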
@@ -901,7 +902,7 @@ def sample(
         Returns
         -------
         New dataframe or series with samples of appropriate size drawn
-        from each group
+        from each group.
         """
         if weights is not None:
@@ -920,7 +921,6 @@ def sample(
             # the alias method, otherwise we're back to bucketed
             # rejection sampling.
             raise NotImplementedError("Sampling with weights is not supported")
-        # Can't wait for match/case
         if frac is not None and n is not None:
             raise ValueError("Cannot supply both of frac and n")
         elif n is None and frac is None:
@@ -941,11 +941,11 @@ def sample(
         # into a numpy array directly, rather than a list.
         # TODO: this uses the sort-based groupby, could one use hash-based?
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         if n is not None:
             samples_per_group = np.broadcast_to(
-                np.int32(n), size_per_group.shape
+                size_type_dtype.type(n), size_per_group.shape
             )
             if not replace and (minsize := size_per_group.min()) < n:
                 raise ValueError(
@@ -958,23 +958,23 @@ def sample(
             # which is round-to-nearest, ties to sgn(x) * inf).
             samples_per_group = np.round(
                 size_per_group * frac, decimals=0
-            ).astype(np.int32)
+            ).astype(size_type_dtype)
         if replace:
             # We would prefer to use cupy here, but their rng.integers
             # interface doesn't take array-based low and high
             # arguments.
             low = 0
             high = np.repeat(size_per_group, samples_per_group)
             rng = np.random.default_rng(seed=random_state)
-            indices = rng.integers(low, high, dtype=np.int32)
+            indices = rng.integers(low, high, dtype=size_type_dtype)
             indices += np.repeat(group_offsets[:-1], samples_per_group)
         else:
             # Approach: do a segmented argsort of the index array and take
             # the first samples_per_group entries from sorted array.
             # We will shuffle the group indices and then pick them out
             # from the grouped dataframe index.
             nrows = len(group_values)
-            indices = cp.arange(nrows, dtype=np.int32)
+            indices = cp.arange(nrows, dtype=size_type_dtype)
             if len(size_per_group) < 500:
                 # Empirically shuffling with cupy is faster at this scale
                 rs = cp.random.get_random_state()
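For the replace=True branch above: each output row is an independent uniform draw within its group, and NumPy's Generator.integers accepts an array-valued high bound (which, per the comment, cupy's does not). A toy sketch:

    import numpy as np

    group_offsets = np.array([0, 3, 5, 9], dtype=np.int32)  # toy boundaries
    size_per_group = np.diff(group_offsets)                 # [3, 2, 4]
    samples_per_group = np.array([2, 2, 2], dtype=np.int32)

    rng = np.random.default_rng(seed=42)
    high = np.repeat(size_per_group, samples_per_group)  # per-draw upper bound
    indices = rng.integers(0, high, dtype=np.int32)      # local row within group
    indices += np.repeat(group_offsets[:-1], samples_per_group)  # global rows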
@@ -992,7 +992,7 @@ def sample(
             )
             indices = cp.asarray(indices.data_array_view(mode="read"))
         # Which indices are we going to want?
-        want = np.arange(samples_per_group.sum(), dtype=np.int32)
+        want = np.arange(samples_per_group.sum(), dtype=size_type_dtype)
         scan = np.empty_like(samples_per_group)
         scan[0] = 0
         np.cumsum(samples_per_group[:-1], out=scan[1:])
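The want/scan computation above is the same offset-fixup trick as in _head_tail: after the in-segment shuffle, take the first samples_per_group entries of each segment (the final gather is elided by the diff). A toy sketch:

    import numpy as np

    group_offsets = np.array([0, 3, 5, 9], dtype=np.int32)
    samples_per_group = np.array([2, 1, 3], dtype=np.int32)

    want = np.arange(samples_per_group.sum(), dtype=np.int32)
    scan = np.empty_like(samples_per_group)
    scan[0] = 0
    np.cumsum(samples_per_group[:-1], out=scan[1:])
    # Move each segment's slice of the flat arange to that segment's start.
    want += np.repeat(group_offsets[:-1] - scan, samples_per_group)
    print(want)  # [0 1 3 5 6 7]: first k shuffled positions per segment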
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import itertools
 import operator
+import string
 import textwrap
 from decimal import Decimal
 
@@ -2974,7 +2975,7 @@ def index(self, request):
                 [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32"
             )
         elif request.param == "strindex":
-            return cudf.StringIndex(list(chr(ord("a") + i) for i in range(n)))
+            return cudf.StringIndex(string.ascii_lowercase[:n])
         elif request.param == "default":
             return None
 
@@ -3011,7 +3012,7 @@ def expected(self, df, *, n=None, frac=None):
                 )
             )
         else:
-            raise AssertionError("Invalid usage")
+            raise ValueError("Must provide either n or frac")
         values = cudf.Series(sorted(values), dtype=df.a.dtype)
         return cudf.DataFrame({"a": values, "b": values, "v": values})
 
