From 008cfe1a97703a2884f2b353c1609ce4e6dba997 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 6 Mar 2023 10:08:02 +0000 Subject: [PATCH] Implement sketch of groupby.sample To do so, obtain the group offsets and values (and hence index). Sample within each group, and then pull out rows from the original object. The fastest way to do this in Python is via the builtin random library, since neither numpy nor cupy offer a broadcasted/ufunc random.sample, and looping over the groups is very slow using either of them. Looping over the groups and using python random.sample is also slow, but less so. --- python/cudf/cudf/core/groupby/groupby.py | 69 +++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8ff3e17d6ff..d50d1487bb5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2,11 +2,12 @@ import itertools import pickle +import random import textwrap import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Tuple, Union +from typing import Any, Iterable, List, Optional, Tuple, Union import cupy as cp import numpy as np @@ -699,6 +700,72 @@ def ngroup(self, ascending=True): group_ids._index = index return self._broadcast(group_ids) + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Union[abc.Sequence, "cudf.Series", None] = None, + random_state: Union[np.random.RandomState, int, None] = None, + ): + """Return a random sample of items in each group. + + Parameters + ---------- + n + Number of items to return for each group. If sampling + without replacement, must be at most the size of the + smallest group. Cannot be used with frac. + frac + Fraction of items to return. Cannot be used with n. Not + currently supported. 
+ replace + Should sampling occur with or without replacement? + weights + Sampling probability for each element. Must be the same + length as the grouped frame. Not currently supported. + random_state + Seed for random number generation. + """ + if frac is not None: + raise NotImplementedError( + "Sorry, sampling with fraction is not supported" + ) + if weights is not None: + raise NotImplementedError( + "Sorry, sampling with weights is not supported" + ) + if random_state is not None and not isinstance(random_state, int): + raise NotImplementedError( + "Sorry, only integer seeds are supported for random_state" + ) + if n is None: + raise ValueError("Please supply a sample size") + # Although the check n is None projects the type of n from + # Optional[int] to int, because of the type-annotation, and + # name-binding in closures, we can't use n in the sample + # lambdas since the value of n might still legitimately be + # None by the time the lambda is called. + nsample = n + rng = random.Random(x=random_state) + if replace: + sample = lambda s, e: [ # noqa: E731 + rng.randrange(s, e) for _ in range(nsample) + ] + else: + sample = lambda s, e: rng.sample( # noqa: E731 + range(s, e), nsample + ) + _, offsets, _, values = self._grouped() + sizes = np.diff(offsets) + indices = list( + itertools.chain.from_iterable( + sample(offset, offset + size) + for size, offset in zip(sizes, offsets) + ) + ) + return self.obj.iloc[values.index[indices]] + def serialize(self): header = {} frames = []