diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py
index 9acdc7e33d5..28777b23583 100644
--- a/python/cudf/benchmarks/API/bench_dataframe.py
+++ b/python/cudf/benchmarks/API/bench_dataframe.py
@@ -125,10 +125,7 @@ def bench_groupby_sample(
     ).astype(int)
     kwargs = {"n": target_size, "replace": replace}
 
-    def _():
-        return grouper.sample(**kwargs)
-
-    benchmark(_)
+    benchmark(grouper.sample, **kwargs)
 
 
 @benchmark_with_object(cls="dataframe", dtype="int")
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
index 938ce56af76..c4dae04b134 100644
--- a/python/cudf/cudf/_lib/sort.pyx
+++ b/python/cudf/cudf/_lib/sort.pyx
@@ -186,10 +186,7 @@ def segmented_sort_by_key(
     column_order = column_order or [True] * ncol
     null_precedence = null_precedence or ["first"] * ncol
     for asc, null in zip(column_order, null_precedence):
-        if asc:
-            c_column_order.push_back(order.ASCENDING)
-        else:
-            c_column_order.push_back(order.DESCENDING)
+        c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING)
         if asc ^ (null == "first"):
             c_null_precedence.push_back(null_order.AFTER)
         elif asc ^ (null == "last"):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cbbe1f2ce0b..a8db09702bb 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -17,6 +17,7 @@
 from cudf._lib.null_mask import bitmask_or
 from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
+from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.types import is_list_like
 from cudf.core.abc import Serializable
@@ -638,7 +639,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
         # aggregation scheme in libcudf. This is probably "fast
         # enough" for most reasonable input sizes.
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         # "Out of bounds" n for the group size either means no entries
         # (negative) or all the entries (positive)
@@ -652,7 +653,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
             group_offsets = group_offsets[:-1]
         else:
             group_offsets = group_offsets[1:] - size_per_group
-        to_take = np.arange(size_per_group.sum(), dtype=np.int32)
+        to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype)
         fixup = np.empty_like(size_per_group)
         fixup[0] = 0
         np.cumsum(size_per_group[:-1], out=fixup[1:])
@@ -901,7 +902,7 @@ def sample(
         Returns
         -------
         New dataframe or series with samples of appropriate size drawn
-        from each group
+        from each group.
 
         """
         if weights is not None:
@@ -920,7 +921,6 @@ def sample(
             # the alias method, otherwise we're back to bucketed
             # rejection sampling.
             raise NotImplementedError("Sampling with weights is not supported")
-        # Can't wait for match/case
         if frac is not None and n is not None:
             raise ValueError("Cannot supply both of frac and n")
         elif n is None and frac is None:
@@ -941,11 +941,11 @@ def sample(
         # into a numpy array directly, rather than a list.
         # TODO: this uses the sort-based groupby, could one use hash-based?
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         if n is not None:
             samples_per_group = np.broadcast_to(
-                np.int32(n), size_per_group.shape
+                size_type_dtype.type(n), size_per_group.shape
             )
             if not replace and (minsize := size_per_group.min()) < n:
                 raise ValueError(
@@ -958,7 +958,7 @@ def sample(
             # which is round-to-nearest, ties to sgn(x) * inf).
             samples_per_group = np.round(
                 size_per_group * frac, decimals=0
-            ).astype(np.int32)
+            ).astype(size_type_dtype)
         if replace:
             # We would prefer to use cupy here, but their rng.integers
             # interface doesn't take array-based low and high
@@ -966,7 +966,7 @@ def sample(
             low = 0
             high = np.repeat(size_per_group, samples_per_group)
             rng = np.random.default_rng(seed=random_state)
-            indices = rng.integers(low, high, dtype=np.int32)
+            indices = rng.integers(low, high, dtype=size_type_dtype)
             indices += np.repeat(group_offsets[:-1], samples_per_group)
         else:
             # Approach: do a segmented argsort of the index array and take
@@ -974,7 +974,7 @@ def sample(
             # We will shuffle the group indices and then pick them out
             # from the grouped dataframe index.
             nrows = len(group_values)
-            indices = cp.arange(nrows, dtype=np.int32)
+            indices = cp.arange(nrows, dtype=size_type_dtype)
             if len(size_per_group) < 500:
                 # Empirically shuffling with cupy is faster at this scale
                 rs = cp.random.get_random_state()
@@ -992,7 +992,7 @@ def sample(
                 )
                 indices = cp.asarray(indices.data_array_view(mode="read"))
             # Which indices are we going to want?
-            want = np.arange(samples_per_group.sum(), dtype=np.int32)
+            want = np.arange(samples_per_group.sum(), dtype=size_type_dtype)
             scan = np.empty_like(samples_per_group)
             scan[0] = 0
             np.cumsum(samples_per_group[:-1], out=scan[1:])
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 53a1caf912a..1cad9eae6c8 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import itertools
 import operator
+import string
 import textwrap
 from decimal import Decimal
 
@@ -2974,7 +2975,7 @@ def index(self, request):
                 [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32"
             )
         elif request.param == "strindex":
-            return cudf.StringIndex(list(chr(ord("a") + i) for i in range(n)))
+            return cudf.StringIndex(string.ascii_lowercase[:n])
         elif request.param == "default":
             return None
 
@@ -3011,7 +3012,7 @@ def expected(self, df, *, n=None, frac=None):
                 )
             )
         else:
-            raise AssertionError("Invalid usage")
+            raise ValueError("Must provide either n or frac")
         values = cudf.Series(sorted(values), dtype=df.a.dtype)
         return cudf.DataFrame({"a": values, "b": values, "v": values})
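
Note on the want/scan arithmetic in sample (and the analogous to_take/fixup code in _head_tail): it gathers the first samples_per_group[i] shuffled indices of each contiguous group with pure vectorized offset arithmetic. The minimal NumPy sketch below illustrates the idea; the helper name take_head_per_group is hypothetical and not part of this diff.

import numpy as np

def take_head_per_group(values, group_offsets, samples_per_group):
    # values is grouped contiguously; group_offsets has one more entry than
    # there are groups, as produced by a sort-based groupby.
    # Enumerate every output slot: 0 .. total_samples - 1.
    want = np.arange(samples_per_group.sum())
    # scan[i] = number of output slots belonging to groups before group i.
    scan = np.empty_like(samples_per_group)
    scan[0] = 0
    np.cumsum(samples_per_group[:-1], out=scan[1:])
    # Slot j sits at position j - scan[i] within its group i, so adding
    # group_offsets[i] - scan[i] maps it to a position in values.
    want += np.repeat(group_offsets[:-1] - scan, samples_per_group)
    return values[want]

# Groups [10, 11, 12], [20, 21], [30, 31, 32, 33]: take 2, 1, and 3 rows.
vals = np.array([10, 11, 12, 20, 21, 30, 31, 32, 33])
offsets = np.array([0, 3, 5, 9])
print(take_head_per_group(vals, offsets, np.array([2, 1, 3])))
# -> [10 11 20 30 31 32]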