From 008cfe1a97703a2884f2b353c1609ce4e6dba997 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 6 Mar 2023 10:08:02 +0000 Subject: [PATCH 01/18] Implement sketch of groupby.sample To do so, obtain the group offsets and values (and hence index). Sample within each group, and then pull out rows from the original object. The fastest way to do this in Python is via the builtin random library, since neither numpy nor cupy offer a broadcasted/ufunc random.sample, and looping over the groups is very slow using either of them. Looping over the groups and using python random.sample is also slow, but less so. --- python/cudf/cudf/core/groupby/groupby.py | 69 +++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8ff3e17d6ff..d50d1487bb5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2,11 +2,12 @@ import itertools import pickle +import random import textwrap import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Tuple, Union +from typing import Any, Iterable, List, Optional, Tuple, Union import cupy as cp import numpy as np @@ -699,6 +700,72 @@ def ngroup(self, ascending=True): group_ids._index = index return self._broadcast(group_ids) + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Union[abc.Sequence, "cudf.Series", None] = None, + random_state: Union[np.random.RandomState, int, None] = None, + ): + """Return a random sample of items in each group. + + Parameters + ---------- + n + Number of items to return for each group, if sampling + without replacement must be at most the size of the + smallest group. Cannot be used with frac. + frac + Fraction of items to return. Cannot be used with n. Not + currently supported. + replace + Should sampling occur with or without replacement? + weights + Sampling probability for each element. Must be the same + length as the grouped frame. Not currently supported. + random_state + Seed for random number generation. + """ + if frac is not None: + raise NotImplementedError( + "Sorry, sampling with fraction is not supported" + ) + if weights is not None: + raise NotImplementedError( + "Sorry, sampling with weights is not supported" + ) + if random_state is not None and not isinstance(random_state, int): + raise NotImplementedError( + "Sorry, only integer seeds are supported for random_state" + ) + if n is None: + raise ValueError("Please supply a sample size") + # Although the check n is None projects the type of n from + # Optional[int] to int, because of the type-annotation, and + # name-binding in closures, we can't use n in the sample + # lambdas since the value of n might still legitimately be + # None by the type the lambda is called. + nsample = n + rng = random.Random(x=random_state) + if replace: + sample = lambda s, e: [ # noqa: E731 + rng.randrange(s, e) for _ in range(nsample) + ] + else: + sample = lambda s, e: rng.sample( # noqa: E731 + range(s, e), nsample + ) + _, offsets, _, values = self._grouped() + sizes = np.diff(offsets) + indices = list( + itertools.chain.from_iterable( + sample(offset, offset + size) + for size, offset in zip(sizes, offsets) + ) + ) + return self.obj.iloc[values.index[indices]] + def serialize(self): header = {} frames = [] From 09723396bc8cd5d5fdfb6c573a6c29fc895d95d3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Mar 2023 11:35:32 +0000 Subject: [PATCH 02/18] Implement fast path for sample --- python/cudf/cudf/core/groupby/groupby.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d50d1487bb5..acb28ae32b8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -727,6 +727,22 @@ def sample( random_state Seed for random number generation. """ + if ( + n is not None + and frac is None + and not replace + and weights is None + and isinstance(self._by, str) + ): + # Fast path since groupby commutes with whole-frame + # sampling in this case + df = self.obj.sample(frac=1, random_state=random_state) + tempname = f"_{''.join(df.columns)}" + df[tempname] = self.obj[self._by].astype("category").codes.cat + df = df.loc[df.groupby(tempname)[tempname].rank("first") <= n, :] + del df[tempname] + return df + if frac is not None: raise NotImplementedError( "Sorry, sampling with fraction is not supported" From 30629acd38903abb1719d8a88c73e7f5fd209203 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Mar 2023 12:47:41 +0000 Subject: [PATCH 03/18] Pacify type-checker and some more implementation --- python/cudf/cudf/core/groupby/groupby.py | 61 +++++++++++++----------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index acb28ae32b8..ebaa7e4d566 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -715,10 +715,10 @@ def sample( n Number of items to return for each group, if sampling without replacement must be at most the size of the - smallest group. Cannot be used with frac. + smallest group. Cannot be used with frac. Default is + ``n=1`` if frac is None. frac - Fraction of items to return. Cannot be used with n. Not - currently supported. + Fraction of items to return. Cannot be used with n. replace Should sampling occur with or without replacement? weights @@ -727,57 +727,64 @@ def sample( random_state Seed for random number generation. """ + if frac is not None and n is not None: + raise ValueError("Cannot supply both of frac and n") + elif n is None and frac is None: + n = 1 if ( n is not None and frac is None and not replace and weights is None and isinstance(self._by, str) + # Need to think harder about seriesgroupby for this case + and isinstance(self.obj, cudf.DataFrame) ): # Fast path since groupby commutes with whole-frame # sampling in this case - df = self.obj.sample(frac=1, random_state=random_state) + df = self.obj.sample( + frac=1, random_state=random_state + ).reset_index(drop=True) + # Cantor name tempname = f"_{''.join(df.columns)}" - df[tempname] = self.obj[self._by].astype("category").codes.cat + newcol = self.obj[self._by] + df[tempname] = newcol.astype("category").codes.cat df = df.loc[df.groupby(tempname)[tempname].rank("first") <= n, :] del df[tempname] return df - if frac is not None: - raise NotImplementedError( - "Sorry, sampling with fraction is not supported" - ) if weights is not None: raise NotImplementedError( "Sorry, sampling with weights is not supported" ) if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( - "Sorry, only integer seeds are supported for random_state" + "Sorry, only integer seeds are supported for random_state " + "in this case" + ) + # Get the groups + _, offsets, _, values = self._grouped() + sizes = np.diff(offsets) + if n is not None: + nsamples: abc.Iterable[int] = itertools.repeat(n) + else: + # Pandas uses Python-based round-to-nearest, ties to even to + # pick sample sizes for the fractional case (unlike IEEE + # which is round-to-nearest, ties to sgn(x) * inf). + nsamples = ( + np.round(sizes * frac, decimals=0).astype(np.int32).tolist() ) - if n is None: - raise ValueError("Please supply a sample size") - # Although the check n is None projects the type of n from - # Optional[int] to int, because of the type-annotation, and - # name-binding in closures, we can't use n in the sample - # lambdas since the value of n might still legitimately be - # None by the type the lambda is called. - nsample = n rng = random.Random(x=random_state) if replace: - sample = lambda s, e: [ # noqa: E731 - rng.randrange(s, e) for _ in range(nsample) + sample = lambda s, e, n: [ # noqa: E731 + rng.randrange(s, e) for _ in range(n) ] else: - sample = lambda s, e: rng.sample( # noqa: E731 - range(s, e), nsample - ) - _, offsets, _, values = self._grouped() - sizes = np.diff(offsets) + sample = lambda s, e, n: rng.sample(range(s, e), n) # noqa: E731 indices = list( itertools.chain.from_iterable( - sample(offset, offset + size) - for size, offset in zip(sizes, offsets) + sample(offset, offset + size, nsample) + for size, offset, nsample in zip(sizes, offsets, nsamples) ) ) return self.obj.iloc[values.index[indices]] From 114a164b069e63831f733e233aee5d7932fc2caa Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 Mar 2023 12:32:04 +0000 Subject: [PATCH 04/18] Faster paths in most cases, better documentation --- python/cudf/cudf/core/groupby/groupby.py | 93 ++++++++++++++++-------- 1 file changed, 63 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ebaa7e4d566..cdc203cd10b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2,7 +2,6 @@ import itertools import pickle -import random import textwrap import warnings from collections import abc @@ -726,16 +725,43 @@ def sample( length as the grouped frame. Not currently supported. random_state Seed for random number generation. + + Returns + ------- + New dataframe or series with samples of appropriate size drawn + from each group + + Notes + ----- + Sampling with fractional group sizes is currently much slower + than sampling with a constant number of items per group. + Please fill an enhancement request if you need this code path + to run fast. """ + if weights is not None: + raise NotImplementedError( + "Sorry, sampling with weights is not supported" + ) if frac is not None and n is not None: raise ValueError("Cannot supply both of frac and n") elif n is None and frac is None: n = 1 + # Supported cases + # + # n frac replace | Method | Speed + # ------------------------------------------------------------------ + # int None False | whole-frame sample | fast + # int None True | groupby, all-at-once | fast + # | random sample generation | + # None float False | groupby, per-group | slow (many groups) + # | random sample generation | + # None float True | groupby, per-group | slow (many groups) + # | random sample generation | if ( n is not None - and frac is None and not replace and weights is None + # Can't create a categorical from more than a single column and isinstance(self._by, str) # Need to think harder about seriesgroupby for this case and isinstance(self.obj, cudf.DataFrame) @@ -748,45 +774,52 @@ def sample( # Cantor name tempname = f"_{''.join(df.columns)}" newcol = self.obj[self._by] - df[tempname] = newcol.astype("category").codes.cat + df[tempname] = newcol.astype("category").cat.codes df = df.loc[df.groupby(tempname)[tempname].rank("first") <= n, :] del df[tempname] return df - if weights is not None: - raise NotImplementedError( - "Sorry, sampling with weights is not supported" - ) + # Get the groups + _, offsets, _, values = self._grouped() + offsets = np.asarray(offsets).astype(np.int32) + sizes = np.diff(offsets) if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( "Sorry, only integer seeds are supported for random_state " "in this case" ) - # Get the groups - _, offsets, _, values = self._grouped() - sizes = np.diff(offsets) - if n is not None: - nsamples: abc.Iterable[int] = itertools.repeat(n) + # TODO: handle random states properly. + rng = np.random.default_rng(seed=random_state) + if n is not None and replace: + # We can use numpy in this case, which is fast for lots of + # groups and large samples + lo = offsets[:-1].reshape(-1, 1) + hi = lo + sizes.reshape(-1, 1) + indices = rng.integers(lo, hi, size=(*lo.shape, n)).reshape(-1) else: - # Pandas uses Python-based round-to-nearest, ties to even to - # pick sample sizes for the fractional case (unlike IEEE - # which is round-to-nearest, ties to sgn(x) * inf). - nsamples = ( - np.round(sizes * frac, decimals=0).astype(np.int32).tolist() - ) - rng = random.Random(x=random_state) - if replace: - sample = lambda s, e, n: [ # noqa: E731 - rng.randrange(s, e) for _ in range(n) - ] - else: - sample = lambda s, e, n: rng.sample(range(s, e), n) # noqa: E731 - indices = list( - itertools.chain.from_iterable( - sample(offset, offset + size, nsample) - for size, offset, nsample in zip(sizes, offsets, nsamples) + if n is not None: + nsamples = np.broadcast_to(np.int32(n), sizes.shape) + else: + # Pandas uses round-to-nearest, ties to even to pick + # sample sizes for the fractional case (unlike IEEE + # which is round-to-nearest, ties to sgn(x) * inf). + nsamples = np.round(sizes * frac, decimals=0).astype(np.int32) + if len(sizes) >= 100_000: + # Arbitrary "large" case + warnings.warn( + "Sampling a large number of groups with these " + "parameters is slow. If you need this to be faster, " + "please open an RFE with your use case." + ) + indices = np.concatenate( + [ + rng.choice(size, size=nsample, replace=replace).astype( + np.int32 + ) + + offset + for size, offset, nsample in zip(sizes, offsets, nsamples) + ] ) - ) return self.obj.iloc[values.index[indices]] def serialize(self): From 2f9a8c1d1793e670960b08758e262078b2be24d3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 Mar 2023 15:46:57 +0000 Subject: [PATCH 05/18] Fix bugs in fast-path code --- python/cudf/cudf/core/groupby/groupby.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cdc203cd10b..ce61f02a8c7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -735,7 +735,7 @@ def sample( ----- Sampling with fractional group sizes is currently much slower than sampling with a constant number of items per group. - Please fill an enhancement request if you need this code path + Please file an enhancement request if you need this code path to run fast. """ if weights is not None: @@ -768,12 +768,18 @@ def sample( ): # Fast path since groupby commutes with whole-frame # sampling in this case + minsize = self.size().min() + if minsize < n: + raise ValueError( + f"Cannot sample {n=} without replacement " + f"smallest group is {minsize}." + ) df = self.obj.sample( frac=1, random_state=random_state ).reset_index(drop=True) # Cantor name tempname = f"_{''.join(df.columns)}" - newcol = self.obj[self._by] + newcol = df[self._by] df[tempname] = newcol.astype("category").cat.codes df = df.loc[df.groupby(tempname)[tempname].rank("first") <= n, :] del df[tempname] From af5036a83eab7953adedb4eaf25cd21a03ebcbd0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 Mar 2023 15:47:25 +0000 Subject: [PATCH 06/18] Add tests of groupby.sample --- python/cudf/cudf/core/groupby/groupby.py | 1 + python/cudf/cudf/tests/test_groupby.py | 71 ++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ce61f02a8c7..58ab4bf3e28 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -812,6 +812,7 @@ def sample( nsamples = np.round(sizes * frac, decimals=0).astype(np.int32) if len(sizes) >= 100_000: # Arbitrary "large" case + # TODO: Check this is a reasonable size warnings.warn( "Sampling a large number of groups with these " "parameters is slow. If you need this to be faster, " diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 97700779a89..5ecb5c831bd 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. +import collections import datetime import itertools import textwrap @@ -2972,3 +2973,73 @@ def test_groupby_dtypes(groups): pdf = df.to_pandas() assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) + + +class TestSample: + @pytest.fixture( + params=[ + ["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], + [1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4], + ], + ids=["str-group", "int-group"], + ) + def df(self, request): + return cudf.DataFrame( + {"a": request.param, "b": request.param, "v": request.param} + ) + + @pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) + def by(self, request): + return request.param + + def expected(self, df, *, n=None, frac=None): + value_counts = collections.Counter(df.a.values_host) + if n is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, n) for v in value_counts.keys() + ) + ) + elif frac is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, round(count * frac)) + for v, count in value_counts.items() + ) + ) + else: + raise AssertionError("Invalid usage") + values = cudf.Series(sorted(values), dtype=df.a.dtype) + return cudf.DataFrame({"a": values, "b": values, "v": values}) + + @pytest.mark.parametrize("n", [None, 0, 1, 2]) + def test_constant_n_no_replace(self, df, by, n): + result = df.groupby(by).sample(n=n).sort_values("a") + n = 1 if n is None else n + assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) + + def test_constant_n_no_replace_too_large_raises(self, df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=3) + + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_constant_n_replace(self, df, by, n): + result = df.groupby(by).sample(n=n, replace=True).sort_values("a") + assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) + + def test_invalid_arguments(self, df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=1, frac=0.1) + + def test_not_implemented_arguments(self, df): + with pytest.raises(NotImplementedError): + # These are valid weights, but we don't implemented this yet. + df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) + + @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) + @pytest.mark.parametrize("replace", [False, True]) + def test_fraction_rounding(self, df, by, frac, replace): + result = ( + df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") + ) + assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) From cf5fc32c15c2a8c47043863fc8bf13a2353f1e68 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 Mar 2023 13:00:23 +0000 Subject: [PATCH 07/18] Expose segmented_sort_by_key to Python --- python/cudf/cudf/_lib/cpp/sorting.pxd | 9 +++- python/cudf/cudf/_lib/sort.pyx | 70 ++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index c6c42c327ac..b210ddf81dd 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -38,3 +38,10 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& table, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + + cdef unique_ptr[table] segmented_sort_by_key( + const table_view& values, + const table_view& keys, + const column_view& segment_offsets, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3b96cc618dd..938ce56af76 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -18,11 +18,13 @@ from cudf._lib.cpp.search cimport lower_bound, upper_bound from cudf._lib.cpp.sorting cimport ( is_sorted as cpp_is_sorted, rank, + segmented_sort_by_key as cpp_segmented_sort_by_key, sorted_order, ) +from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order -from cudf._lib.utils cimport table_view_from_columns +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @acquire_spill_lock() @@ -143,6 +145,70 @@ def order_by(list columns_from_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) +def segmented_sort_by_key( + list values, + list keys, + Column segment_offsets, + list column_order=None, + list null_precedence=None, +): + """ + Sort segments of a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + offsets : Column + Segment offsets + column_order : list[bool], optional + Sequence of boolean values which correspond to each column in + keys providing the sort order (default all True). + With True <=> ascending; False <=> descending. + null_precedence : list[str], optional + Sequence of "first" or "last" values (default "first") + indicating the position of null values when sorting the keys. + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + cdef table_view values_view = table_view_from_columns(values) + cdef table_view keys_view = table_view_from_columns(keys) + cdef column_view offsets_view = segment_offsets.view() + cdef vector[order] c_column_order + cdef vector[null_order] c_null_precedence + cdef unique_ptr[table] result + ncol = len(values) + column_order = column_order or [True] * ncol, + null_precedence = null_precedence or ["first"] * ncol, + for asc, null in zip(column_order, null_precedence): + if asc: + c_column_order.push_back(order.ASCENDING) + else: + c_column_order.push_back(order.DESCENDING) + if asc ^ (null == "first"): + c_null_precedence.push_back(null_order.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.push_back(null_order.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + with nogil: + result = move( + cpp_segmented_sort_by_key( + values_view, + keys_view, + offsets_view, + c_column_order, + c_null_precedence, + ) + ) + return columns_from_unique_ptr(move(result)) + + @acquire_spill_lock() def digitize(list source_columns, list bins, bool right=False): """ From a97ba8da78b4ddb2acbe596dc80c13a4003e6f06 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 Mar 2023 13:03:10 +0000 Subject: [PATCH 08/18] No more pathological slow cases in groupby sample Also remove large memory footprint for sampling without replacement. --- python/cudf/cudf/core/groupby/groupby.py | 150 +++++++++++------------ 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 58ab4bf3e28..b9921efcca1 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,6 +16,7 @@ from cudf._lib import groupby as libgroupby from cudf._lib.null_mask import bitmask_or from cudf._lib.reshape import interleave_columns +from cudf._lib.sort import segmented_sort_by_key from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_list_like from cudf.core.abc import Serializable @@ -731,103 +732,98 @@ def sample( New dataframe or series with samples of appropriate size drawn from each group - Notes - ----- - Sampling with fractional group sizes is currently much slower - than sampling with a constant number of items per group. - Please file an enhancement request if you need this code path - to run fast. """ if weights is not None: raise NotImplementedError( "Sorry, sampling with weights is not supported" ) + # Can't wait for match/case if frac is not None and n is not None: raise ValueError("Cannot supply both of frac and n") elif n is None and frac is None: n = 1 - # Supported cases - # - # n frac replace | Method | Speed - # ------------------------------------------------------------------ - # int None False | whole-frame sample | fast - # int None True | groupby, all-at-once | fast - # | random sample generation | - # None float False | groupby, per-group | slow (many groups) - # | random sample generation | - # None float True | groupby, per-group | slow (many groups) - # | random sample generation | - if ( - n is not None - and not replace - and weights is None - # Can't create a categorical from more than a single column - and isinstance(self._by, str) - # Need to think harder about seriesgroupby for this case - and isinstance(self.obj, cudf.DataFrame) - ): - # Fast path since groupby commutes with whole-frame - # sampling in this case - minsize = self.size().min() - if minsize < n: - raise ValueError( - f"Cannot sample {n=} without replacement " - f"smallest group is {minsize}." - ) - df = self.obj.sample( - frac=1, random_state=random_state - ).reset_index(drop=True) - # Cantor name - tempname = f"_{''.join(df.columns)}" - newcol = df[self._by] - df[tempname] = newcol.astype("category").cat.codes - df = df.loc[df.groupby(tempname)[tempname].rank("first") <= n, :] - del df[tempname] - return df - - # Get the groups - _, offsets, _, values = self._grouped() - offsets = np.asarray(offsets).astype(np.int32) - sizes = np.diff(offsets) + elif frac is not None and not (0 <= frac <= 1): + raise ValueError( + "Sampling with fraction must provide fraction in " + f"[0, 1], got {frac=}" + ) + # TODO: handle random states properly. if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( "Sorry, only integer seeds are supported for random_state " "in this case" ) - # TODO: handle random states properly. - rng = np.random.default_rng(seed=random_state) - if n is not None and replace: + # Get the groups + try: + _, (index,), group_offsets = self._groupby.groups( + [*self.obj._index._columns] # type: ignore + ) + except ValueError: + raise NotImplementedError( + "Sorry groupby.sample with multiindex not implemented" + ) + group_offsets = np.asarray(group_offsets).astype(np.int32) + size_per_group = np.diff(group_offsets) + if n is not None: + samples_per_group = np.broadcast_to( + np.int32(n), size_per_group.shape + ) + if not replace and (minsize := size_per_group.min()) < n: + raise ValueError( + f"Cannot sample {n=} without replacement, " + f"smallest group is {minsize}" + ) + else: + # Pandas uses round-to-nearest, ties to even to + # pick sample sizes for the fractional case (unlike IEEE + # which is round-to-nearest, ties to sgn(x) * inf). + samples_per_group = np.round( + size_per_group * frac, decimals=0 + ).astype(np.int32) + if replace: + # Use numpy to do all-at-once generation of the # We can use numpy in this case, which is fast for lots of # groups and large samples - lo = offsets[:-1].reshape(-1, 1) - hi = lo + sizes.reshape(-1, 1) - indices = rng.integers(lo, hi, size=(*lo.shape, n)).reshape(-1) + lo = 0 + hi = np.repeat(size_per_group, samples_per_group) + rng = np.random.default_rng(seed=random_state) + # Would be nice to use cupy here, but their rng.integers + # interface doesn't take array lo and hi arguments. + indices = rng.integers(lo, hi, dtype=np.int32).reshape(-1) + indices += np.repeat(group_offsets[:-1], samples_per_group) else: - if n is not None: - nsamples = np.broadcast_to(np.int32(n), sizes.shape) + # Approach: do a segmented argsort of the index array and take + # the first samples_per_group entries from sorted array. + if len(size_per_group) < 500: + # Empirically shuffling with cupy is faster at this scale + indices = cp.asarray(index.data_array_view(mode="read")) + rs = cp.random.get_random_state() + rs.seed(seed=random_state) + for off, size in zip(group_offsets, size_per_group): + rs.shuffle(indices[off : off + size]) else: - # Pandas uses round-to-nearest, ties to even to pick - # sample sizes for the fractional case (unlike IEEE - # which is round-to-nearest, ties to sgn(x) * inf). - nsamples = np.round(sizes * frac, decimals=0).astype(np.int32) - if len(sizes) >= 100_000: - # Arbitrary "large" case - # TODO: Check this is a reasonable size - warnings.warn( - "Sampling a large number of groups with these " - "parameters is slow. If you need this to be faster, " - "please open an RFE with your use case." + rng = cp.random.default_rng(seed=random_state) + offsets_col = as_column(group_offsets) + values = [index] + keys = [as_column(rng.random(size=len(index)))] + (indices,) = segmented_sort_by_key( + values, keys, offsets_col, [], [] ) - indices = np.concatenate( - [ - rng.choice(size, size=nsample, replace=replace).astype( - np.int32 - ) - + offset - for size, offset, nsample in zip(sizes, offsets, nsamples) - ] + indices = cp.asarray(indices.data_array_view(mode="read")) + # For many groups this is the slowest part. + # Really I want a segmented_take that returns the first n + # values from each segment which would also be good for head/tail + to_take = np.fromiter( + itertools.chain.from_iterable( + range(off, off + size) + for off, size in zip(group_offsets, samples_per_group) + ), + dtype=np.int32, ) - return self.obj.iloc[values.index[indices]] + indices = indices[to_take] + # This is the index into the original dataframe ordered by the groups + index = cp.asarray(index.data_array_view(mode="read")) + return self.obj.iloc[index[indices]] def serialize(self): header = {} From cffe605c979b0c465a22489dd97fe327e293b4e5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 Mar 2023 16:13:16 +0000 Subject: [PATCH 09/18] Slightly faster masking of the shuffled indices --- python/cudf/cudf/core/groupby/groupby.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b9921efcca1..f8d2024c25f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -810,17 +810,11 @@ def sample( values, keys, offsets_col, [], [] ) indices = cp.asarray(indices.data_array_view(mode="read")) - # For many groups this is the slowest part. - # Really I want a segmented_take that returns the first n - # values from each segment which would also be good for head/tail - to_take = np.fromiter( - itertools.chain.from_iterable( - range(off, off + size) - for off, size in zip(group_offsets, samples_per_group) - ), - dtype=np.int32, - ) - indices = indices[to_take] + # Which indices are we going to want? + mask = np.zeros(len(index), dtype=bool) + for offset, sample_size in zip(group_offsets, samples_per_group): + mask[offset : offset + sample_size] = True + indices = indices[mask] # This is the index into the original dataframe ordered by the groups index = cp.asarray(index.data_array_view(mode="read")) return self.obj.iloc[index[indices]] From aa0fd8c11bdc1f08cc9ee48de12765423947889d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 Mar 2023 17:41:29 +0000 Subject: [PATCH 10/18] Minor fixes --- python/cudf/cudf/core/groupby/groupby.py | 36 +++++++++++------------- python/cudf/cudf/tests/test_groupby.py | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f8d2024c25f..3fe4ee0822f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -734,9 +734,7 @@ def sample( """ if weights is not None: - raise NotImplementedError( - "Sorry, sampling with weights is not supported" - ) + raise NotImplementedError("Sampling with weights is not supported") # Can't wait for match/case if frac is not None and n is not None: raise ValueError("Cannot supply both of frac and n") @@ -750,7 +748,7 @@ def sample( # TODO: handle random states properly. if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( - "Sorry, only integer seeds are supported for random_state " + "Only integer seeds are supported for random_state " "in this case" ) # Get the groups @@ -760,9 +758,9 @@ def sample( ) except ValueError: raise NotImplementedError( - "Sorry groupby.sample with multiindex not implemented" + "groupby.sample with multiindex not implemented" ) - group_offsets = np.asarray(group_offsets).astype(np.int32) + group_offsets = np.asarray(group_offsets, dtype=np.int32) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( @@ -781,16 +779,15 @@ def sample( size_per_group * frac, decimals=0 ).astype(np.int32) if replace: - # Use numpy to do all-at-once generation of the - # We can use numpy in this case, which is fast for lots of - # groups and large samples - lo = 0 - hi = np.repeat(size_per_group, samples_per_group) + # We would prefer to use cupy here, but their rng.integers + # interface doesn't take array-based low and high + # arguments. + low = 0 + high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - # Would be nice to use cupy here, but their rng.integers - # interface doesn't take array lo and hi arguments. - indices = rng.integers(lo, hi, dtype=np.int32).reshape(-1) - indices += np.repeat(group_offsets[:-1], samples_per_group) + indices = rng.integers(low, high, dtype=np.int32) + np.repeat( + group_offsets[:-1], samples_per_group + ) else: # Approach: do a segmented argsort of the index array and take # the first samples_per_group entries from sorted array. @@ -803,11 +800,12 @@ def sample( rs.shuffle(indices[off : off + size]) else: rng = cp.random.default_rng(seed=random_state) - offsets_col = as_column(group_offsets) - values = [index] - keys = [as_column(rng.random(size=len(index)))] (indices,) = segmented_sort_by_key( - values, keys, offsets_col, [], [] + [index], + [as_column(rng.random(size=len(index)))], + as_column(group_offsets), + [], + [], ) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 5ecb5c831bd..197749a1143 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3033,7 +3033,7 @@ def test_invalid_arguments(self, df): def test_not_implemented_arguments(self, df): with pytest.raises(NotImplementedError): - # These are valid weights, but we don't implemented this yet. + # These are valid weights, but we don't implement this yet. df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) From 74cc64b4ceda60c41c9b32329af646dcb3316df7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 Mar 2023 13:07:23 +0000 Subject: [PATCH 11/18] Fix sample for non-range index Also speed up index masking and add pointers to implementation ideas for weighted sampling. --- python/cudf/cudf/core/groupby/groupby.py | 56 ++++++++++++++---------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3fe4ee0822f..8ee1a71334d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -734,6 +734,20 @@ def sample( """ if weights is not None: + # To implement this case again needs different algorithms + # in both cases. + # + # Without replacement, use the weighted reservoir sampling + # approach of Efraimidas and Spirakis (2006) + # https://doi.org/10.1016/j.ipl.2005.11.003, essentially, + # do a segmented argsort sorting on weight-scaled + # logarithmic deviates. See + # https://timvieira.github.io/blog/post/ + # 2019/09/16/algorithms-for-sampling-without-replacement/ + # + # With replacement is trickier, one might be able to use + # the alias method, otherwise we're back to bucketed + # rejection sampling. raise NotImplementedError("Sampling with weights is not supported") # Can't wait for match/case if frac is not None and n is not None: @@ -752,15 +766,11 @@ def sample( "in this case" ) # Get the groups - try: - _, (index,), group_offsets = self._groupby.groups( - [*self.obj._index._columns] # type: ignore - ) - except ValueError: - raise NotImplementedError( - "groupby.sample with multiindex not implemented" - ) - group_offsets = np.asarray(group_offsets, dtype=np.int32) + # TODO: convince Cython to convert the std::vector offsets + # into a numpy array directly, rather than a list. + # TODO: this uses the sort-based groupby, could one use hash-based? + _, offsets, _, group_values = self._grouped() + group_offsets = np.asarray(offsets, dtype=np.int32) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( @@ -785,15 +795,17 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=np.int32) + np.repeat( - group_offsets[:-1], samples_per_group - ) + indices = rng.integers(low, high, dtype=np.int32) + indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take # the first samples_per_group entries from sorted array. + # We will shuffle the group indices and then pick them out + # from the grouped dataframe index. + nrows = len(group_values) + indices = cp.arange(nrows, dtype=np.int32) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale - indices = cp.asarray(index.data_array_view(mode="read")) rs = cp.random.get_random_state() rs.seed(seed=random_state) for off, size in zip(group_offsets, size_per_group): @@ -801,21 +813,21 @@ def sample( else: rng = cp.random.default_rng(seed=random_state) (indices,) = segmented_sort_by_key( - [index], - [as_column(rng.random(size=len(index)))], + [as_column(indices)], + [as_column(rng.random(size=nrows))], as_column(group_offsets), [], [], ) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - mask = np.zeros(len(index), dtype=bool) - for offset, sample_size in zip(group_offsets, samples_per_group): - mask[offset : offset + sample_size] = True - indices = indices[mask] - # This is the index into the original dataframe ordered by the groups - index = cp.asarray(index.data_array_view(mode="read")) - return self.obj.iloc[index[indices]] + want = np.arange(samples_per_group.sum(), dtype=np.int32) + scan = np.empty_like(samples_per_group) + scan[0] = 0 + np.cumsum(samples_per_group[:-1], out=scan[1:]) + want += np.repeat(offsets[:-1] - scan, samples_per_group) + indices = indices[want] + return group_values.iloc[indices] def serialize(self): header = {} From 6113e8e830c43a56a9ad3d2c58c9f67e5c8b1f0c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 Mar 2023 13:08:20 +0000 Subject: [PATCH 12/18] Test groupby.sample with non rangeindex --- python/cudf/cudf/tests/test_groupby.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 197749a1143..5bff9760523 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2976,6 +2976,20 @@ def test_groupby_dtypes(groups): class TestSample: + @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) + def index(self, request): + n = 12 + if request.param == "rangeindex": + return cudf.RangeIndex(2, n + 2) + elif request.param == "intindex": + return cudf.Index( + [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" + ) + elif request.param == "strindex": + return cudf.StringIndex(list(chr(ord("a") + i) for i in range(n))) + elif request.param == "default": + return None + @pytest.fixture( params=[ ["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], @@ -2983,9 +2997,10 @@ class TestSample: ], ids=["str-group", "int-group"], ) - def df(self, request): + def df(self, index, request): return cudf.DataFrame( - {"a": request.param, "b": request.param, "v": request.param} + {"a": request.param, "b": request.param, "v": request.param}, + index=index, ) @pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) From 85acba9366a885cc4b0add22dc7c08f1665bf3a6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 Mar 2023 11:26:27 +0000 Subject: [PATCH 13/18] Add groupby.sample to pytest benchmarks --- python/cudf/benchmarks/API/bench_dataframe.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 42bfa854396..9acdc7e33d5 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. """Benchmarks of DataFrame methods.""" @@ -104,6 +104,33 @@ def bench_groupby_agg(benchmark, dataframe, agg, num_key_cols, as_index, sort): benchmark(dataframe.groupby(by=by, as_index=as_index, sort=sort).agg, agg) +@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6) +@pytest.mark.parametrize( + "num_key_cols", + [2, 3, 4], +) +@pytest.mark.parametrize("use_frac", [True, False]) +@pytest.mark.parametrize("replace", [True, False]) +@pytest.mark.parametrize("target_sample_frac", [0.1, 0.5, 1]) +def bench_groupby_sample( + benchmark, dataframe, num_key_cols, use_frac, replace, target_sample_frac +): + grouper = dataframe.groupby(by=list(dataframe.columns[:num_key_cols])) + if use_frac: + kwargs = {"frac": target_sample_frac, "replace": replace} + else: + minsize = grouper.size().min() + target_size = numpy.round( + target_sample_frac * minsize, decimals=0 + ).astype(int) + kwargs = {"n": target_size, "replace": replace} + + def _(): + return grouper.sample(**kwargs) + + benchmark(_) + + @benchmark_with_object(cls="dataframe", dtype="int") @pytest.mark.parametrize("num_cols_to_sort", [1]) def bench_sort_values(benchmark, dataframe, num_cols_to_sort): From e90a214f28d42dc0b4bd8b362fe927dd78338895 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 Mar 2023 11:46:02 +0000 Subject: [PATCH 14/18] Use numpy group_offsets --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8ee1a71334d..cd6c623fa70 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -825,7 +825,7 @@ def sample( scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) - want += np.repeat(offsets[:-1] - scan, samples_per_group) + want += np.repeat(group_offsets[:-1] - scan, samples_per_group) indices = indices[want] return group_values.iloc[indices] From adb7a0b6d080988e78ba3c389cada8299dc95d7b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 Mar 2023 16:35:26 +0000 Subject: [PATCH 15/18] Minor fixes in review --- python/cudf/benchmarks/API/bench_dataframe.py | 5 +---- python/cudf/cudf/_lib/sort.pyx | 5 +---- python/cudf/cudf/core/groupby/groupby.py | 20 +++++++++---------- python/cudf/cudf/tests/test_groupby.py | 5 +++-- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 9acdc7e33d5..28777b23583 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -125,10 +125,7 @@ def bench_groupby_sample( ).astype(int) kwargs = {"n": target_size, "replace": replace} - def _(): - return grouper.sample(**kwargs) - - benchmark(_) + benchmark(grouper.sample, **kwargs) @benchmark_with_object(cls="dataframe", dtype="int") diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 938ce56af76..c4dae04b134 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -186,10 +186,7 @@ def segmented_sort_by_key( column_order = column_order or [True] * ncol, null_precedence = null_precedence or ["first"] * ncol, for asc, null in zip(column_order, null_precedence): - if asc: - c_column_order.push_back(order.ASCENDING) - else: - c_column_order.push_back(order.DESCENDING) + c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING) if asc ^ (null == "first"): c_null_precedence.push_back(null_order.AFTER) elif asc ^ (null == "last"): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cbbe1f2ce0b..a8db09702bb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -17,6 +17,7 @@ from cudf._lib.null_mask import bitmask_or from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key +from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_list_like from cudf.core.abc import Serializable @@ -638,7 +639,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=np.int32) + group_offsets = np.asarray(offsets, dtype=size_type_dtype) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -652,7 +653,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=np.int32) + to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -901,7 +902,7 @@ def sample( Returns ------- New dataframe or series with samples of appropriate size drawn - from each group + from each group. """ if weights is not None: @@ -920,7 +921,6 @@ def sample( # the alias method, otherwise we're back to bucketed # rejection sampling. raise NotImplementedError("Sampling with weights is not supported") - # Can't wait for match/case if frac is not None and n is not None: raise ValueError("Cannot supply both of frac and n") elif n is None and frac is None: @@ -941,11 +941,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=np.int32) + group_offsets = np.asarray(offsets, dtype=size_type_dtype) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - np.int32(n), size_per_group.shape + size_type_dtype(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -958,7 +958,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(np.int32) + ).astype(size_type_dtype) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -966,7 +966,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=np.int32) + indices = rng.integers(low, high, dtype=size_type_dtype) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -974,7 +974,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=np.int32) + indices = cp.arange(nrows, dtype=size_type_dtype) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -992,7 +992,7 @@ def sample( ) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=np.int32) + want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 53a1caf912a..1cad9eae6c8 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -4,6 +4,7 @@ import datetime import itertools import operator +import string import textwrap from decimal import Decimal @@ -2974,7 +2975,7 @@ def index(self, request): [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" ) elif request.param == "strindex": - return cudf.StringIndex(list(chr(ord("a") + i) for i in range(n))) + return cudf.StringIndex(string.ascii_lowercase[:n]) elif request.param == "default": return None @@ -3011,7 +3012,7 @@ def expected(self, df, *, n=None, frac=None): ) ) else: - raise AssertionError("Invalid usage") + raise ValueError("Must provide either n or frac") values = cudf.Series(sorted(values), dtype=df.a.dtype) return cudf.DataFrame({"a": values, "b": values, "v": values}) From 7998d909e0f0d28b7b93bb6bb7e638c4ab57562c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 Mar 2023 16:53:46 +0000 Subject: [PATCH 16/18] Dtypes aren't callable --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a8db09702bb..1ab1da92bc7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -945,7 +945,7 @@ def sample( size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype(n), size_per_group.shape + size_type_dtype.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( From 558541dc3fadf34fa28137b717b6ebc5478370f3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 Mar 2023 17:02:00 +0000 Subject: [PATCH 17/18] Trailing comma is bad --- python/cudf/cudf/_lib/sort.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index c4dae04b134..3c3f8cabda6 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -183,8 +183,8 @@ def segmented_sort_by_key( cdef vector[null_order] c_null_precedence cdef unique_ptr[table] result ncol = len(values) - column_order = column_order or [True] * ncol, - null_precedence = null_precedence or ["first"] * ncol, + column_order = column_order or [True] * ncol + null_precedence = null_precedence or ["first"] * ncol for asc, null in zip(column_order, null_precedence): c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING) if asc ^ (null == "first"): From 206d07ad90a6802243aa369bee557fb5f6711559 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 Mar 2023 09:26:34 +0000 Subject: [PATCH 18/18] Correct string index construction --- python/cudf/cudf/tests/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 1cad9eae6c8..1b86c68b582 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2975,7 +2975,7 @@ def index(self, request): [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" ) elif request.param == "strindex": - return cudf.StringIndex(string.ascii_lowercase[:n]) + return cudf.StringIndex(list(string.ascii_lowercase[:n])) elif request.param == "default": return None