Commit adb7a0b: Minor fixes in review
wence- committed Mar 22, 2023
1 parent b57b068 commit adb7a0b
Showing 4 changed files with 15 additions and 20 deletions.
5 changes: 1 addition & 4 deletions python/cudf/benchmarks/API/bench_dataframe.py
@@ -125,10 +125,7 @@ def bench_groupby_sample(
     ).astype(int)
     kwargs = {"n": target_size, "replace": replace}
 
-    def _():
-        return grouper.sample(**kwargs)
-
-    benchmark(_)
+    benchmark(grouper.sample, **kwargs)
 
 
 @benchmark_with_object(cls="dataframe", dtype="int")
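Note on the benchmark change above: pytest-benchmark's benchmark fixture accepts a callable plus its arguments and times fn(*args, **kwargs) directly, so the throwaway closure only added an extra frame. A minimal sketch of the convention, with a hypothetical toy frame standing in for the fixture data:

    import cudf

    def test_groupby_sample(benchmark):
        df = cudf.DataFrame({"g": [0, 0, 1, 1], "v": [1, 2, 3, 4]})
        grouper = df.groupby("g")
        kwargs = {"n": 1, "replace": True}
        # Equivalent to the deleted `def _(): return grouper.sample(**kwargs)`
        benchmark(grouper.sample, **kwargs)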
5 changes: 1 addition & 4 deletions python/cudf/cudf/_lib/sort.pyx
@@ -186,10 +186,7 @@ def segmented_sort_by_key(
     column_order = column_order or [True] * ncol
     null_precedence = null_precedence or ["first"] * ncol
     for asc, null in zip(column_order, null_precedence):
-        if asc:
-            c_column_order.push_back(order.ASCENDING)
-        else:
-            c_column_order.push_back(order.DESCENDING)
+        c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING)
         if asc ^ (null == "first"):
             c_null_precedence.push_back(null_order.AFTER)
         elif asc ^ (null == "last"):
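Aside on the XOR in the retained lines: libcudf's null_order says whether nulls compare before or after all other values, so a descending column must flip the user-facing "first"/"last". A plain-Python sketch of the mapping (the invalid-value error here is an assumption; the original's else branch is truncated above):

    def null_precedence(asc: bool, null: str) -> str:
        if null not in {"first", "last"}:
            raise ValueError(f"Invalid null precedence {null}")
        # asc ^ (null == "first") is True exactly when the user's request
        # must map to null_order.AFTER in libcudf terms.
        return "AFTER" if asc ^ (null == "first") else "BEFORE"

    assert null_precedence(True, "first") == "BEFORE"   # ascending, nulls first
    assert null_precedence(True, "last") == "AFTER"     # ascending, nulls last
    assert null_precedence(False, "first") == "AFTER"   # descending, nulls first
    assert null_precedence(False, "last") == "BEFORE"   # descending, nulls last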
20 changes: 10 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -17,6 +17,7 @@
 from cudf._lib.null_mask import bitmask_or
 from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
+from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.types import is_list_like
 from cudf.core.abc import Serializable
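Context for the new import: size_type_dtype is cudf's Python-side alias for libcudf's size_type, the dtype of row offsets and gather indices. The rest of this commit swaps it in for hard-coded np.int32 so index buffers track libcudf if that type ever changes. A quick check, assuming a current build where size_type is 32-bit:

    import numpy as np
    from cudf._lib.types import size_type_dtype

    assert size_type_dtype == np.dtype("int32")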
@@ -638,7 +639,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
         # aggregation scheme in libcudf. This is probably "fast
         # enough" for most reasonable input sizes.
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         # "Out of bounds" n for the group size either means no entries
         # (negative) or all the entries (positive)
@@ -652,7 +653,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
             group_offsets = group_offsets[:-1]
         else:
             group_offsets = group_offsets[1:] - size_per_group
-        to_take = np.arange(size_per_group.sum(), dtype=np.int32)
+        to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype)
         fixup = np.empty_like(size_per_group)
         fixup[0] = 0
         np.cumsum(size_per_group[:-1], out=fixup[1:])
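The two hunks above only change a dtype, but the surrounding index arithmetic is worth spelling out: a flat arange over the total take count is shifted per group so each group's slice starts at its own offset. A NumPy-only sketch with illustrative toy offsets:

    import numpy as np

    offsets = np.array([0, 3, 5, 9], dtype=np.int32)  # toy group boundaries
    n = 2                                             # head(n)
    size_per_group = np.minimum(np.diff(offsets), n)  # clip n to group sizes
    to_take = np.arange(size_per_group.sum(), dtype=np.int32)
    fixup = np.empty_like(size_per_group)
    fixup[0] = 0
    np.cumsum(size_per_group[:-1], out=fixup[1:])
    # Shift each group's slice from its position in the flat arange to the
    # group's start row.
    to_take += np.repeat(offsets[:-1] - fixup, size_per_group)
    print(to_take)  # [0 1 3 4 5 6]: first two rows of each group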
@@ -901,7 +902,7 @@ def sample(
         Returns
         -------
         New dataframe or series with samples of appropriate size drawn
-        from each group
+        from each group.
         """
         if weights is not None:
@@ -920,7 +921,6 @@ def sample(
             # the alias method, otherwise we're back to bucketed
             # rejection sampling.
             raise NotImplementedError("Sampling with weights is not supported")
-        # Can't wait for match/case
         if frac is not None and n is not None:
             raise ValueError("Cannot supply both of frac and n")
         elif n is None and frac is None:
@@ -941,11 +941,11 @@ def sample(
         # into a numpy array directly, rather than a list.
         # TODO: this uses the sort-based groupby, could one use hash-based?
         _, offsets, _, group_values = self._grouped()
-        group_offsets = np.asarray(offsets, dtype=np.int32)
+        group_offsets = np.asarray(offsets, dtype=size_type_dtype)
         size_per_group = np.diff(group_offsets)
         if n is not None:
             samples_per_group = np.broadcast_to(
-                np.int32(n), size_per_group.shape
+                size_type_dtype.type(n), size_per_group.shape
             )
             if not replace and (minsize := size_per_group.min()) < n:
                 raise ValueError(
@@ -958,23 +958,23 @@ def sample(
             # which is round-to-nearest, ties to sgn(x) * inf).
             samples_per_group = np.round(
                 size_per_group * frac, decimals=0
-            ).astype(np.int32)
+            ).astype(size_type_dtype)
         if replace:
             # We would prefer to use cupy here, but their rng.integers
             # interface doesn't take array-based low and high
             # arguments.
             low = 0
             high = np.repeat(size_per_group, samples_per_group)
             rng = np.random.default_rng(seed=random_state)
-            indices = rng.integers(low, high, dtype=np.int32)
+            indices = rng.integers(low, high, dtype=size_type_dtype)
             indices += np.repeat(group_offsets[:-1], samples_per_group)
         else:
             # Approach: do a segmented argsort of the index array and take
             # the first samples_per_group entries from sorted array.
             # We will shuffle the group indices and then pick them out
             # from the grouped dataframe index.
             nrows = len(group_values)
-            indices = cp.arange(nrows, dtype=np.int32)
+            indices = cp.arange(nrows, dtype=size_type_dtype)
             if len(size_per_group) < 500:
                 # Empirically shuffling with cupy is faster at this scale
                 rs = cp.random.get_random_state()
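For the replace=True branch above: each output row is an independent uniform draw within its group, and NumPy's Generator.integers accepts an array-valued high bound (which, per the comment, cupy's does not). A toy sketch:

    import numpy as np

    group_offsets = np.array([0, 3, 5, 9], dtype=np.int32)  # toy boundaries
    size_per_group = np.diff(group_offsets)                 # [3, 2, 4]
    samples_per_group = np.array([2, 2, 2], dtype=np.int32)

    rng = np.random.default_rng(seed=42)
    high = np.repeat(size_per_group, samples_per_group)  # per-draw upper bound
    indices = rng.integers(0, high, dtype=np.int32)      # local row within group
    indices += np.repeat(group_offsets[:-1], samples_per_group)  # global rows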
@@ -992,7 +992,7 @@ def sample(
             )
             indices = cp.asarray(indices.data_array_view(mode="read"))
         # Which indices are we going to want?
-        want = np.arange(samples_per_group.sum(), dtype=np.int32)
+        want = np.arange(samples_per_group.sum(), dtype=size_type_dtype)
         scan = np.empty_like(samples_per_group)
         scan[0] = 0
         np.cumsum(samples_per_group[:-1], out=scan[1:])
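The want/scan computation above is the same offset-fixup trick as in _head_tail: after the in-segment shuffle, take the first samples_per_group entries of each segment (the final gather is elided by the diff). A toy sketch:

    import numpy as np

    group_offsets = np.array([0, 3, 5, 9], dtype=np.int32)
    samples_per_group = np.array([2, 1, 3], dtype=np.int32)

    want = np.arange(samples_per_group.sum(), dtype=np.int32)
    scan = np.empty_like(samples_per_group)
    scan[0] = 0
    np.cumsum(samples_per_group[:-1], out=scan[1:])
    # Move each segment's slice of the flat arange to that segment's start.
    want += np.repeat(group_offsets[:-1] - scan, samples_per_group)
    print(want)  # [0 1 3 5 6 7]: first k shuffled positions per segment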
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import itertools
 import operator
+import string
 import textwrap
 from decimal import Decimal
 
@@ -2974,7 +2975,7 @@ def index(self, request):
                 [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32"
             )
         elif request.param == "strindex":
-            return cudf.StringIndex(list(chr(ord("a") + i) for i in range(n)))
+            return cudf.StringIndex(string.ascii_lowercase[:n])
         elif request.param == "default":
             return None
 
@@ -3011,7 +3012,7 @@ def expected(self, df, *, n=None, frac=None):
                 )
             )
         else:
-            raise AssertionError("Invalid usage")
+            raise ValueError("Must provide either n or frac")
         values = cudf.Series(sorted(values), dtype=df.a.dtype)
         return cudf.DataFrame({"a": values, "b": values, "v": values})
 
