From b831cc976ba56cde6176b9c0a21061e3b6a5600a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 9 Feb 2022 16:06:14 -0800 Subject: [PATCH 01/38] Rewrites `sample` --- python/cudf/cudf/_lib/copying.pyx | 26 --- python/cudf/cudf/_lib/cpp/copying.pxd | 7 - python/cudf/cudf/core/dataframe.py | 37 ++++ python/cudf/cudf/core/frame.py | 195 +-------------------- python/cudf/cudf/core/indexed_frame.py | 214 ++++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 108 +++++++++--- python/cudf/cudf/tests/test_index.py | 105 ----------- 7 files changed, 335 insertions(+), 357 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30157bc10ad..718f92cea75 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -652,32 +652,6 @@ def get_element(Column input_column, size_type index): ) -def sample(input, size_type n, - bool replace, int64_t seed, bool keep_index=True): - cdef table_view tbl_view = table_view_from_table(input, not keep_index) - cdef cpp_copying.sample_with_replacement replacement - - if replace: - replacement = cpp_copying.sample_with_replacement.TRUE - else: - replacement = cpp_copying.sample_with_replacement.FALSE - - cdef unique_ptr[table] c_output - with nogil: - c_output = move( - cpp_copying.sample(tbl_view, n, replacement, seed) - ) - - return data_from_unique_ptr( - move(c_output), - column_names=input._column_names, - index_names=( - None if keep_index is False - else input._index_names - ) - ) - - def segmented_gather(Column source_column, Column gather_map): cdef shared_ptr[lists_column_view] source_LCV = ( make_shared[lists_column_view](source_column.view()) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index be1b6d8069c..4ac0dc09160 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -175,10 +175,3 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: ctypedef enum sample_with_replacement: FALSE 'cudf::sample_with_replacement::FALSE', TRUE 'cudf::sample_with_replacement::TRUE', - - cdef unique_ptr[table] sample ( - table_view input, - size_type n, - sample_with_replacement replacement, - int64_t seed - ) except + diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8a49efabcde..62e6d254873 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,6 +26,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf +from cudf._typing import ColumnLike from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -6081,6 +6082,42 @@ def nunique(self, axis=0, dropna=True): return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def _sample_axis_1( + self, + n: int, + weights: ColumnLike, + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + if replace: + raise NotImplementedError( + "Sample is not supported for axis 1/'columns' " + "when 'replace=True'" + ) + + if n > 0 and self.shape[1] == 0: + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty" + ) + + columns = np.asarray(self._data.names) + if not replace and n > columns.size: + raise ValueError( + "Cannot take a larger sample " + "than population when 'replace=False'" + ) + + sampled_column_labels = random_state.choice( + columns, size=n, replace=False, p=weights + ) + + result = self._get_columns_by_label(sampled_column_labels) + if ignore_index: + result.reset_index(drop=True) + + return result + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2c5606cb17f..3ea400610d9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -49,7 +49,7 @@ from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.dtypes import find_common_type T = TypeVar("T", bound="Frame") @@ -1573,199 +1573,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - keep_index=True, - ): - """Return a random sample of items from an axis of object. - - You can use random_state for reproducibility. - - Parameters - ---------- - n : int, optional - Number of items from axis to return. Cannot be used with frac. - Default = 1 if frac = None. - frac : float, optional - Fraction of axis items to return. Cannot be used with n. - replace : bool, default False - Allow or disallow sampling of the same row more than once. - replace == True is not yet supported for axis = 1/"columns" - weights : str or ndarray-like, optional - Only supported for axis=1/"columns" - random_state : int, numpy RandomState or None, default None - Seed for the random number generator (if int), or None. - If None, a random seed will be chosen. - if RandomState, seed will be extracted from current state. - axis : {0 or ‘index’, 1 or ‘columns’, None}, default None - Axis to sample. Accepts axis number or name. - Default is stat axis for given data type - (0 for Series and DataFrames). Series and Index doesn't - support axis=1. - - Returns - ------- - Series or DataFrame or Index - A new object of same type as caller containing n items - randomly sampled from the caller object. - - Examples - -------- - >>> import cudf as cudf - >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) - >>> df.sample(3) - a - 1 2 - 3 4 - 0 1 - - >>> sr = cudf.Series([1, 2, 3, 4, 5]) - >>> sr.sample(10, replace=True) - 1 4 - 3 1 - 2 4 - 0 5 - 0 1 - 4 5 - 4 1 - 0 2 - 0 3 - 3 2 - dtype: int64 - - >>> df = cudf.DataFrame( - ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]}) - >>> df.sample(2, axis=1) - a c - 0 1 3 - 1 2 4 - """ - - if frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` " - "when upsampling the population `frac` > 1." - ) - elif frac is not None and n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both" - ) - - if frac is None and n is None: - n = 1 - elif frac is not None: - if axis is None or axis == 0 or axis == "index": - n = int(round(self.shape[0] * frac)) - else: - n = int(round(self.shape[1] * frac)) - - if axis is None or axis == 0 or axis == "index": - if n > 0 and self.shape[0] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - if not replace and n > self.shape[0]: - raise ValueError( - "Cannot take a larger sample than population " - "when 'replace=False'" - ) - - if weights is not None: - raise NotImplementedError( - "weights is not yet supported for axis=0/index" - ) - - if random_state is None: - seed = np.random.randint( - np.iinfo(np.int64).max, dtype=np.int64 - ) - elif isinstance(random_state, np.random.mtrand.RandomState): - _, keys, pos, _, _ = random_state.get_state() - seed = 0 if pos >= len(keys) else pos - else: - seed = np.int64(random_state) - - result = self.__class__._from_data( - *libcudf.copying.sample( - self, - n=n, - replace=replace, - seed=seed, - keep_index=keep_index, - ) - ) - result._copy_type_metadata(self) - - return result - else: - if len(self.shape) != 2: - raise ValueError( - f"No axis named {axis} for " - f"object type {self.__class__}" - ) - - if replace: - raise NotImplementedError( - "Sample is not supported for " - f"axis {axis} when 'replace=True'" - ) - - if n > 0 and self.shape[1] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - columns = np.asarray(self._data.names) - if not replace and n > columns.size: - raise ValueError( - "Cannot take a larger sample " - "than population when 'replace=False'" - ) - - if weights is not None: - if is_column_like(weights): - weights = np.asarray(weights) - else: - raise ValueError( - "Strings can only be passed to weights " - "when sampling from rows on a DataFrame" - ) - - if columns.size != len(weights): - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - - total_weight = weights.sum() - if total_weight != 1: - if not isinstance(weights.dtype, float): - weights = weights.astype("float64") - weights = weights / total_weight - - np.random.seed(random_state) - gather_map = np.random.choice( - columns, size=n, replace=replace, p=weights - ) - - if isinstance(self, cudf.MultiIndex): - # TODO: Need to update this once MultiIndex is refactored, - # should be able to treat it similar to other Frame object - result = cudf.Index(self.to_frame(index=False)[gather_map]) - else: - result = self[gather_map] - if not keep_index: - result.index = None - - return result - @classmethod @annotate("FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fab5d75f62b..93980e971cd 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6,7 +6,7 @@ import operator import warnings from collections import Counter, abc -from typing import Callable, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar, Union from uuid import uuid4 import cupy as cp @@ -30,6 +30,7 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame +from cudf.utils.dtypes import is_column_like from cudf.utils.utils import cached_property doc_reset_index_template = """ @@ -1695,6 +1696,175 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + @annotate("SAMPLE", color="orange", domain="cudf_python") + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + """Return a random sample of items from an axis of object. + + You can use random_state for reproducibility. + + Notes + ----- + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a host random state (``numpy.random.RandomState``) or a device random + state (``cupy.random.RandomState``). If a host random state is used, + the rows sampled is guarenteed to match pandas result, but performance + may be decreased if the item count is too high. Optionally, user may + specify a device random state to achieve better performance at high + item counts. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with frac. + Default = 1 if frac = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with n. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + replace == True is not yet supported for axis = 1/"columns" + weights : str or ndarray-like, optional + Only supported for axis=1/"columns" + random_state : int, numpy/cupy RandomState, or None, default None + If None, default cupy random state is chosen. + If int, as seed for the default cupy random state. + If RandomState, rows-to-sample are generated from the RandomState. + axis : {0 or `index`, 1 or `columns`, None}, default None + Axis to sample. Accepts axis number or name. + Default is stat axis for given data type + (0 for Series and DataFrames). Series and Index doesn't + support axis=1. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing n items + randomly sampled from the caller object. + + Examples + -------- + >>> import cudf as cudf + >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) + >>> df.sample(3) + a + 1 2 + 3 4 + 0 1 + + >>> sr = cudf.Series([1, 2, 3, 4, 5]) + >>> sr.sample(10, replace=True) + 1 4 + 3 1 + 2 4 + 0 5 + 0 1 + 4 5 + 4 1 + 0 2 + 0 3 + 3 2 + dtype: int64 + + >>> df = cudf.DataFrame( + ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]}) + >>> df.sample(2, axis=1) + a c + 0 1 3 + 1 2 4 + """ + if frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` " + "when upsampling the population `frac` > 1." + ) + elif frac is not None and n is not None: + raise ValueError( + "Please enter a value for `frac` OR `n`, not both" + ) + + axis = self._get_axis_from_axis_arg(axis) + + if frac is None and n is None: + n = 1 + elif frac is not None: + if axis == 0: + n = int(round(self.shape[0] * frac)) + else: + n = int(round(self.shape[1] * frac)) + + size = self.shape[axis] + weights = preprocess_weights(weights, size) + random_state = preprocess_random_state(random_state, axis) + + if axis == 0: + return self._sample_axis_0( + n, weights, replace, random_state, ignore_index + ) + else: + if isinstance(random_state, cp.random.RandomState): + raise ValueError( + "Use a host random state when sampling from " + "axis=1/columns." + ) + return self._sample_axis_1( + n, weights, replace, random_state, ignore_index + ) + + def _sample_axis_0( + self, + n: int, + weights: None, + replace: bool, + random_state: Union[np.random.RandomState, cp.random.RandomState], + ignore_index: bool, + ): + if n > 0 and self.shape[0] == 0: + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty" + ) + + if not replace and n > self.shape[0]: + raise ValueError( + "Cannot take a larger sample than population " + "when 'replace=False'" + ) + + if weights is not None: + raise NotImplementedError( + "weights is not yet supported for axis=0/index" + ) + + # Dynamic dispatch to host/device depending on state provided. + gather_map = cudf.core.column.as_column( + random_state.choice(len(self), size=n, replace=replace) + ) + + return self._gather( + gather_map, keep_index=not ignore_index, check_bounds=False + ) + + def _sample_axis_1( + self, + n: int, + weights: None, + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + raise NotImplementedError( + "Sampling from axis 1 is only implemented for Dataframe." + ) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" @@ -1711,3 +1881,45 @@ def _check_duplicate_level_names(specified, level_names): f"The names {duplicates_specified} occurs multiple times, use a" " level number" ) + + +def preprocess_weights( + weights: Optional[ColumnLike], size: int +) -> Optional[ColumnLike]: + """If ``weights`` is not None, normalize weights.""" + if weights is not None: + if is_column_like(weights): + weights = np.asarray(weights) + else: + raise NotImplementedError( + "Weights specified by string is unsupported yet." + ) + + if size != len(weights): + raise ValueError( + "Weights and axis to be sampled must be of same length" + ) + + total_weight = weights.sum() + if total_weight != 1: + if not isinstance(weights.dtype, float): + weights = weights.astype("float64") + weights = weights / total_weight + return weights + + +def preprocess_random_state( + seed_or_random_state: Union[ + None, np.number, np.random.RandomState, cp.random.RandomState + ], + axis: int, +) -> Union[np.random.RandomState, cp.random.RandomState]: + """Construct random state from parameter ``seed_or_random_state``.""" + if not isinstance( + seed_or_random_state, (np.random.RandomState, cp.random.RandomState) + ): + lib = cp if axis == 0 else np + seed_or_random_state = lib.random.RandomState( + seed=seed_or_random_state + ) + return seed_or_random_state diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fb173bc0eab..83abf3c62fc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11,6 +11,7 @@ from copy import copy import cupy +import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -7400,14 +7401,34 @@ def test_cudf_arrow_array_error(): sr.__arrow_array__() -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) +def make_random_state(seed, random_state_lib, force_host=False): + if random_state_lib: + return ( + np.random.RandomState(seed) + if force_host + else random_state_lib(seed) + ) + else: + return seed + + +@pytest.mark.parametrize("n", [0, 2, 10, None]) +@pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_dataframe_sample_basic(n, frac, replace, axis): - # as we currently don't support column with same name +@pytest.mark.parametrize( + "random_state_lib", [None, np.random.RandomState, cp.random.RandomState], +) +def test_dataframe_sample_basic(n, frac, replace, axis, random_state_lib): + # TODO: decouple valid parameter set from invalid parameter set and + # write two separate tests to reduce function complexity. if axis == 1 and replace: - return + pytest.skip(reason="we currently don't support column with same name.") + if axis == 1 and random_state_lib == cp.random.RandomState: + pytest.skip( + reason="cannot sample column axis with device random state." + ) + pdf = pd.DataFrame( { "a": [1, 2, 3, 4, 5], @@ -7417,14 +7438,16 @@ def test_dataframe_sample_basic(n, frac, replace, axis): index=[1, 2, 3, 4, 5], ) df = cudf.DataFrame.from_pandas(pdf) - random_state = 0 + seed = 1 try: pout = pdf.sample( n=n, frac=frac, replace=replace, - random_state=random_state, + random_state=make_random_state( + seed, random_state_lib, force_host=True + ), axis=axis, ) except BaseException: @@ -7437,7 +7460,9 @@ def test_dataframe_sample_basic(n, frac, replace, axis): "n": n, "frac": frac, "replace": replace, - "random_state": random_state, + "random_state": make_random_state( + seed, random_state_lib, force_host=True + ), "axis": axis, }, ), @@ -7447,7 +7472,7 @@ def test_dataframe_sample_basic(n, frac, replace, axis): "n": n, "frac": frac, "replace": replace, - "random_state": random_state, + "random_state": make_random_state(seed, random_state_lib), "axis": axis, }, ), @@ -7457,34 +7482,61 @@ def test_dataframe_sample_basic(n, frac, replace, axis): n=n, frac=frac, replace=replace, - random_state=random_state, + random_state=make_random_state(seed, random_state_lib), axis=axis, ) - assert pout.shape == gout.shape + if random_state_lib == np.random.RandomState: + assert_eq(pout, gout) + else: + assert pout.shape == gout.shape + + +@pytest.mark.parametrize("replace", [True, False]) +def test_dataframe_reproducibility_seed(replace): + df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) + + seed = 1 + expected = df.sample(1024, replace=replace, random_state=seed) + out = df.sample(1024, replace=replace, random_state=seed) + + assert_eq(expected, out) @pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("random_state", [1, np.random.mtrand.RandomState(10)]) -def test_dataframe_reproducibility(replace, random_state): +@pytest.mark.parametrize("lib", [np, cp]) +def test_dataframe_reproducibility_rstate(replace, lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - expected = df.sample(1024, replace=replace, random_state=random_state) - out = df.sample(1024, replace=replace, random_state=random_state) + seed = 10 + expected = df.sample( + 1024, replace=replace, random_state=lib.random.RandomState(seed) + ) + out = df.sample( + 1024, replace=replace, random_state=lib.random.RandomState(seed) + ) assert_eq(expected, out) -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) +@pytest.mark.parametrize("n", [0, 2, 10, None]) +@pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) -def test_series_sample_basic(n, frac, replace): +@pytest.mark.parametrize( + "random_state_lib", [None, np.random.RandomState, cp.random.RandomState], +) +def test_series_sample_basic(n, frac, replace, random_state_lib): psr = pd.Series([1, 2, 3, 4, 5]) sr = cudf.Series.from_pandas(psr) - random_state = 0 + seed = 1 try: pout = psr.sample( - n=n, frac=frac, replace=replace, random_state=random_state + n=n, + frac=frac, + replace=replace, + random_state=make_random_state( + seed, random_state_lib, force_host=True + ), ) except BaseException: assert_exceptions_equal( @@ -7496,7 +7548,9 @@ def test_series_sample_basic(n, frac, replace): "n": n, "frac": frac, "replace": replace, - "random_state": random_state, + "random_state": make_random_state( + seed, random_state_lib, force_host=True + ), }, ), rfunc_args_and_kwargs=( @@ -7505,15 +7559,21 @@ def test_series_sample_basic(n, frac, replace): "n": n, "frac": frac, "replace": replace, - "random_state": random_state, + "random_state": make_random_state(seed, random_state_lib), }, ), ) else: gout = sr.sample( - n=n, frac=frac, replace=replace, random_state=random_state + n=n, + frac=frac, + replace=replace, + random_state=make_random_state(seed, random_state_lib), ) - assert pout.shape == gout.shape + if random_state_lib == np.random.RandomState: + assert_eq(pout, gout) + else: + assert pout.shape == gout.shape @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6679725ae9a..65346f57a11 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1614,111 +1614,6 @@ def test_interval_index_from_breaks(closed): assert_eq(pindex, gindex) -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -def test_index_sample_basic(n, frac, replace): - psr = pd.Series([1, 2, 3, 4, 5]) - gindex = cudf.Index(psr) - random_state = 0 - - try: - pout = psr.sample( - n=n, frac=frac, replace=replace, random_state=random_state - ) - except BaseException: - assert_exceptions_equal( - lfunc=psr.sample, - rfunc=gindex.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - }, - ), - ) - else: - gout = gindex.sample( - n=n, frac=frac, replace=replace, random_state=random_state - ) - - assert pout.shape == gout.shape - - -@pytest.mark.parametrize("n", [2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_multiindex_sample_basic(n, frac, replace, axis): - # as we currently don't support column with same name - if axis == 1 and replace: - return - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - mul_index = cudf.Index(cudf.from_pandas(pdf)) - random_state = 0 - - try: - pout = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, - ) - except BaseException: - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=mul_index.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - ) - else: - gout = mul_index.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, - ) - assert pout.shape == gout.shape - - @pytest.mark.parametrize( "data", [ From ec64a98c9c6d274db7edcc8a4e2f0986a5f48f2e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 9 Feb 2022 16:40:42 -0800 Subject: [PATCH 02/38] Restore support for index but add deprecation warning --- python/cudf/cudf/core/_base_index.py | 22 ++++++ python/cudf/cudf/tests/test_index.py | 107 +++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2e6f138d2e3..7b8b91a4d36 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +import warnings from typing import Any, Set import pandas as pd @@ -1524,6 +1525,27 @@ def _split_columns_by_levels(self, levels): [], ) + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + warnings.warn( + "Index.sample is deprecated and will be removed.", FutureWarning, + ) + return cudf.core.index._index_from_data( + self.to_frame() + .sample( + n, frac, replace, weights, random_state, axis, ignore_index + ) + ._data + ) + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 65346f57a11..6c1a705ffcb 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1614,6 +1614,113 @@ def test_interval_index_from_breaks(closed): assert_eq(pindex, gindex) +@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) +@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) +@pytest.mark.parametrize("replace", [True, False]) +def test_index_sample_basic(n, frac, replace): + psr = pd.Series([1, 2, 3, 4, 5]) + gindex = cudf.Index(psr) + random_state = 0 + + try: + pout = psr.sample( + n=n, frac=frac, replace=replace, random_state=random_state + ) + except BaseException: + assert_exceptions_equal( + lfunc=psr.sample, + rfunc=gindex.sample, + lfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": random_state, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": random_state, + }, + ), + ) + else: + gout = gindex.sample( + n=n, frac=frac, replace=replace, random_state=random_state + ) + + assert pout.shape == gout.shape + + +@pytest.mark.parametrize("n", [2, 5, 10, None]) +@pytest.mark.parametrize("frac", [0.5, 1, 2, None]) +@pytest.mark.parametrize("replace", [True, False]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_multiindex_sample_basic(n, frac, replace, axis): + # as we currently don't support column with same name + if axis == 1 and replace: + return + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + mul_index = cudf.Index(cudf.from_pandas(pdf)) + random_state = 0 + + try: + pout = pdf.sample( + n=n, + frac=frac, + replace=replace, + random_state=random_state, + axis=axis, + ) + except BaseException: + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=mul_index.sample, + lfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": random_state, + "axis": axis, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": random_state, + "axis": axis, + }, + ), + ) + else: + gout = mul_index.sample( + n=n, + frac=frac, + replace=replace, + random_state=random_state, + axis=axis, + ) + if axis == 1 and n is None and frac is None: + pout = pout.iloc[:, 0] + assert pout.shape == gout.shape + + @pytest.mark.parametrize( "data", [ From 80e6cf63e646141caa581c356c156a6e89d5de45 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 10:30:59 -0800 Subject: [PATCH 03/38] Better logic to compute `n`. --- python/cudf/cudf/core/indexed_frame.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 93980e971cd..d76aaf3b35c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1794,15 +1794,12 @@ def sample( axis = self._get_axis_from_axis_arg(axis) - if frac is None and n is None: - n = 1 - elif frac is not None: - if axis == 0: - n = int(round(self.shape[0] * frac)) - else: - n = int(round(self.shape[1] * frac)) - size = self.shape[axis] + if frac is None: + n = 1 if n is None else n + else: + n = int(round(size * frac)) + weights = preprocess_weights(weights, size) random_state = preprocess_random_state(random_state, axis) From 82597e6e4c79be6d84edc99be40d121f04cff76e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 10:35:06 -0800 Subject: [PATCH 04/38] Move size checks --- python/cudf/cudf/core/indexed_frame.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d76aaf3b35c..a14bb51ea21 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1714,11 +1714,11 @@ def sample( Notes ----- When sampling from ``axis=0/'index'``, ``random_state`` can be either - a host random state (``numpy.random.RandomState``) or a device random - state (``cupy.random.RandomState``). If a host random state is used, + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). If a numpy random state is used, the rows sampled is guarenteed to match pandas result, but performance may be decreased if the item count is too high. Optionally, user may - specify a device random state to achieve better performance at high + specify a cupy random state to achieve better performance at high item counts. Parameters @@ -1800,6 +1800,11 @@ def sample( else: n = int(round(size * frac)) + if n > 0 and size == 0: + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty" + ) + weights = preprocess_weights(weights, size) random_state = preprocess_random_state(random_state, axis) @@ -1810,7 +1815,7 @@ def sample( else: if isinstance(random_state, cp.random.RandomState): raise ValueError( - "Use a host random state when sampling from " + "Use a numpy random state when sampling from " "axis=1/columns." ) return self._sample_axis_1( @@ -1825,11 +1830,6 @@ def _sample_axis_0( random_state: Union[np.random.RandomState, cp.random.RandomState], ignore_index: bool, ): - if n > 0 and self.shape[0] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - if not replace and n > self.shape[0]: raise ValueError( "Cannot take a larger sample than population " @@ -1841,7 +1841,7 @@ def _sample_axis_0( "weights is not yet supported for axis=0/index" ) - # Dynamic dispatch to host/device depending on state provided. + # Dynamic dispatch to numpy/cupy depending on state provided. gather_map = cudf.core.column.as_column( random_state.choice(len(self), size=n, replace=replace) ) From aa166b757aca88ae561a250582769d006579d82b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 11:14:12 -0800 Subject: [PATCH 05/38] Consolidating error checking codes to reduce checking the same variable twice; Fix type annotations. --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 44 +++++++++++--------------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 62e6d254873..acd059d7451 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6085,7 +6085,7 @@ def nunique(self, axis=0, dropna=True): def _sample_axis_1( self, n: int, - weights: ColumnLike, + weights: Optional[ColumnLike], replace: bool, random_state: np.random.RandomState, ignore_index: bool, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a14bb51ea21..c9dfd19f93b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1782,22 +1782,21 @@ def sample( 0 1 3 1 2 4 """ - if frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` " - "when upsampling the population `frac` > 1." - ) - elif frac is not None and n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both" - ) - axis = self._get_axis_from_axis_arg(axis) - size = self.shape[axis] + if frac is None: n = 1 if n is None else n else: + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` " + "when upsampling the population `frac` > 1." + ) + if n is not None: + raise ValueError( + "Please enter a value for `frac` OR `n`, not both" + ) n = int(round(size * frac)) if n > 0 and size == 0: @@ -1805,6 +1804,12 @@ def sample( "Cannot take a sample larger than 0 when axis is empty" ) + if not replace and n > size: + raise ValueError( + "Cannot take a larger sample than population " + "when 'replace=False'" + ) + weights = preprocess_weights(weights, size) random_state = preprocess_random_state(random_state, axis) @@ -1825,25 +1830,14 @@ def sample( def _sample_axis_0( self, n: int, - weights: None, + weights: Optional[ColumnLike], replace: bool, random_state: Union[np.random.RandomState, cp.random.RandomState], ignore_index: bool, ): - if not replace and n > self.shape[0]: - raise ValueError( - "Cannot take a larger sample than population " - "when 'replace=False'" - ) - - if weights is not None: - raise NotImplementedError( - "weights is not yet supported for axis=0/index" - ) - # Dynamic dispatch to numpy/cupy depending on state provided. gather_map = cudf.core.column.as_column( - random_state.choice(len(self), size=n, replace=replace) + random_state.choice(len(self), size=n, replace=replace, p=weights) ) return self._gather( @@ -1853,7 +1847,7 @@ def _sample_axis_0( def _sample_axis_1( self, n: int, - weights: None, + weights: Optional[ColumnLike], replace: bool, random_state: np.random.RandomState, ignore_index: bool, From 0466e84241eb308d4dcf43c455a1be2f714f143a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 11:18:39 -0800 Subject: [PATCH 06/38] Change error message. Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c9dfd19f93b..a772fddbf3e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1853,7 +1853,7 @@ def _sample_axis_1( ignore_index: bool, ): raise NotImplementedError( - "Sampling from axis 1 is only implemented for Dataframe." + f"Sampling from axis 1 is not implemented for {self.__class__}." ) From 8965538f68c0ef6dda97d914536716017834eeff Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 11:43:47 -0800 Subject: [PATCH 07/38] Inlining `weights`, `random_state` handling; rewriting `weights` docstring. --- python/cudf/cudf/core/indexed_frame.py | 75 ++++++++++---------------- 1 file changed, 29 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a772fddbf3e..510424fc0ed 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1731,8 +1731,12 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. replace == True is not yet supported for axis = 1/"columns" - weights : str or ndarray-like, optional - Only supported for axis=1/"columns" + weights : numpy or cupy ndarray-like, optional + Default `None` for uniform probability distribution over rows to + sample from. If `ndarray` is passed, the length of `weights` should + equal to the number of rows to sample from, and will be normalized + to have sum of 1. Unlike pandas, index alignment is not currently + not performed. random_state : int, numpy/cupy RandomState, or None, default None If None, default cupy random state is chosen. If int, as seed for the default cupy random state. @@ -1785,6 +1789,7 @@ def sample( axis = self._get_axis_from_axis_arg(axis) size = self.shape[axis] + # Compute `n` from parameter `frac`. if frac is None: n = 1 if n is None else n else: @@ -1810,8 +1815,28 @@ def sample( "when 'replace=False'" ) - weights = preprocess_weights(weights, size) - random_state = preprocess_random_state(random_state, axis) + # Normalize `weights` array. + if weights is not None: + if is_column_like(weights): + weights = np.asarray(weights) + else: + raise NotImplementedError( + "Weights specified by string is unsupported yet." + ) + + if size != len(weights): + raise ValueError( + "Weights and axis to be sampled must be of same length" + ) + + weights = weights / weights.sum() + + # Construct random state if `random_state` parameter is a seed. + if not isinstance( + random_state, (np.random.RandomState, cp.random.RandomState) + ): + lib = cp if axis == 0 else np + random_state = lib.random.RandomState(seed=random_state) if axis == 0: return self._sample_axis_0( @@ -1872,45 +1897,3 @@ def _check_duplicate_level_names(specified, level_names): f"The names {duplicates_specified} occurs multiple times, use a" " level number" ) - - -def preprocess_weights( - weights: Optional[ColumnLike], size: int -) -> Optional[ColumnLike]: - """If ``weights`` is not None, normalize weights.""" - if weights is not None: - if is_column_like(weights): - weights = np.asarray(weights) - else: - raise NotImplementedError( - "Weights specified by string is unsupported yet." - ) - - if size != len(weights): - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - - total_weight = weights.sum() - if total_weight != 1: - if not isinstance(weights.dtype, float): - weights = weights.astype("float64") - weights = weights / total_weight - return weights - - -def preprocess_random_state( - seed_or_random_state: Union[ - None, np.number, np.random.RandomState, cp.random.RandomState - ], - axis: int, -) -> Union[np.random.RandomState, cp.random.RandomState]: - """Construct random state from parameter ``seed_or_random_state``.""" - if not isinstance( - seed_or_random_state, (np.random.RandomState, cp.random.RandomState) - ): - lib = cp if axis == 0 else np - seed_or_random_state = lib.random.RandomState( - seed=seed_or_random_state - ) - return seed_or_random_state From 3f33dcb70124b7b3e26dbd45b200bceb2d43f2be Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 12:32:10 -0800 Subject: [PATCH 08/38] Rewrites `make_random_state` into a fixture; Optimizes code locations. --- python/cudf/cudf/tests/conftest.py | 39 +++++++++ python/cudf/cudf/tests/test_dataframe.py | 100 +++-------------------- python/cudf/cudf/tests/test_series.py | 42 ++++++++++ 3 files changed, 92 insertions(+), 89 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 041bd055f0a..eebe3c15145 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,10 +1,49 @@ import pathlib +import cupy as cp +import numpy as np import pytest import rmm # noqa: F401 +from cudf.testing._utils import assert_eq + @pytest.fixture(scope="session") def datadir(): return pathlib.Path(__file__).parent / "data" + + +@pytest.fixture( + params=[None, 42, np.random.RandomState, cp.random.RandomState] +) +def random_state_tuple(request): + """A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState, np.random.RandomState + cudf: None, seed(int), np.random.RandomState, cp.random.RandomState + ------ + check: shape, shape, exact result, shape + + Each column above stands for one valid parameter combination and check. + """ + + def shape_checker(expected, got): + assert expected.shape == got.shape + + def exact_checker(expected, got): + assert_eq(expected, got) + + seed_or_state_ctor = request.param + if seed_or_state_ctor is None: + return None, None, shape_checker + elif isinstance(seed_or_state_ctor, int): + return seed_or_state_ctor, seed_or_state_ctor, shape_checker + elif seed_or_state_ctor == np.random.RandomState: + return seed_or_state_ctor(42), seed_or_state_ctor(42), exact_checker + elif seed_or_state_ctor == cp.random.RandomState: + return np.random.RandomState(42), seed_or_state_ctor(42), shape_checker + else: + pytest.skip("Unsupported params.") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 83abf3c62fc..7ad1b3dd6a1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7401,30 +7401,18 @@ def test_cudf_arrow_array_error(): sr.__arrow_array__() -def make_random_state(seed, random_state_lib, force_host=False): - if random_state_lib: - return ( - np.random.RandomState(seed) - if force_host - else random_state_lib(seed) - ) - else: - return seed - - @pytest.mark.parametrize("n", [0, 2, 10, None]) @pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize( - "random_state_lib", [None, np.random.RandomState, cp.random.RandomState], -) -def test_dataframe_sample_basic(n, frac, replace, axis, random_state_lib): +def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): # TODO: decouple valid parameter set from invalid parameter set and # write two separate tests to reduce function complexity. if axis == 1 and replace: pytest.skip(reason="we currently don't support column with same name.") - if axis == 1 and random_state_lib == cp.random.RandomState: + + pd_random_state, gd_random_state, checker = random_state_tuple + if axis == 1 and isinstance(gd_random_state, cp.random.RandomState): pytest.skip( reason="cannot sample column axis with device random state." ) @@ -7438,16 +7426,13 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_lib): index=[1, 2, 3, 4, 5], ) df = cudf.DataFrame.from_pandas(pdf) - seed = 1 try: - pout = pdf.sample( + expected = pdf.sample( n=n, frac=frac, replace=replace, - random_state=make_random_state( - seed, random_state_lib, force_host=True - ), + random_state=pd_random_state, axis=axis, ) except BaseException: @@ -7460,9 +7445,7 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_lib): "n": n, "frac": frac, "replace": replace, - "random_state": make_random_state( - seed, random_state_lib, force_host=True - ), + "random_state": pd_random_state, "axis": axis, }, ), @@ -7472,23 +7455,20 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_lib): "n": n, "frac": frac, "replace": replace, - "random_state": make_random_state(seed, random_state_lib), + "random_state": gd_random_state, "axis": axis, }, ), ) else: - gout = df.sample( + got = df.sample( n=n, frac=frac, replace=replace, - random_state=make_random_state(seed, random_state_lib), + random_state=gd_random_state, axis=axis, ) - if random_state_lib == np.random.RandomState: - assert_eq(pout, gout) - else: - assert pout.shape == gout.shape + checker(expected, got) @pytest.mark.parametrize("replace", [True, False]) @@ -7518,64 +7498,6 @@ def test_dataframe_reproducibility_rstate(replace, lib): assert_eq(expected, out) -@pytest.mark.parametrize("n", [0, 2, 10, None]) -@pytest.mark.parametrize("frac", [0.3, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize( - "random_state_lib", [None, np.random.RandomState, cp.random.RandomState], -) -def test_series_sample_basic(n, frac, replace, random_state_lib): - psr = pd.Series([1, 2, 3, 4, 5]) - sr = cudf.Series.from_pandas(psr) - seed = 1 - - try: - pout = psr.sample( - n=n, - frac=frac, - replace=replace, - random_state=make_random_state( - seed, random_state_lib, force_host=True - ), - ) - except BaseException: - assert_exceptions_equal( - lfunc=psr.sample, - rfunc=sr.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": make_random_state( - seed, random_state_lib, force_host=True - ), - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": make_random_state(seed, random_state_lib), - }, - ), - ) - else: - gout = sr.sample( - n=n, - frac=frac, - replace=replace, - random_state=make_random_state(seed, random_state_lib), - ) - if random_state_lib == np.random.RandomState: - assert_eq(pout, gout) - else: - assert pout.shape == gout.shape - - @pytest.mark.parametrize( "df", [ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 358484d79b9..3461dc73f30 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1590,3 +1590,45 @@ def test_fill(data, fill_value, begin, end, inplace): def test_fill_new_category(): gs = cudf.Series(pd.Categorical(["a", "b", "c"])) gs[0:1] = "d" + + +@pytest.mark.parametrize("n", [0, 2, 10, None]) +@pytest.mark.parametrize("frac", [0.3, 2, None]) +@pytest.mark.parametrize("replace", [True, False]) +def test_series_sample_basic(n, frac, replace, random_state_tuple): + pd_random_state, gd_random_state, checker = random_state_tuple + psr = pd.Series([1, 2, 3, 4, 5]) + sr = cudf.Series.from_pandas(psr) + + try: + expected = psr.sample( + n=n, frac=frac, replace=replace, random_state=pd_random_state, + ) + except BaseException: + assert_exceptions_equal( + lfunc=psr.sample, + rfunc=sr.sample, + lfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": pd_random_state, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "n": n, + "frac": frac, + "replace": replace, + "random_state": gd_random_state, + }, + ), + ) + else: + got = sr.sample( + n=n, frac=frac, replace=replace, random_state=gd_random_state, + ) + checker(expected, got) From 79d3d568ea7471800c4d8ae99aac2b32e7c88828 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 12:39:04 -0800 Subject: [PATCH 09/38] Use fixture in reproducibility test --- python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ad1b3dd6a1..6031e856a88 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7483,16 +7483,21 @@ def test_dataframe_reproducibility_seed(replace): @pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("lib", [np, cp]) -def test_dataframe_reproducibility_rstate(replace, lib): +def test_dataframe_sample_reproducibility(replace, random_state_tuple): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) + _, gd_random_state, _ = random_state_tuple + if gd_random_state is None: + pytest.skip( + "`None` seed defaults to global random state that's dependent to" + "system implementation. Reproducibility isn't guarenteed." + ) seed = 10 expected = df.sample( - 1024, replace=replace, random_state=lib.random.RandomState(seed) + 1024, replace=replace, random_state=type(gd_random_state)(seed) ) out = df.sample( - 1024, replace=replace, random_state=lib.random.RandomState(seed) + 1024, replace=replace, random_state=type(gd_random_state)(seed) ) assert_eq(expected, out) From ecd80610b70f2118d5a83aedbab0d5dd9423c548 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 15:42:06 -0800 Subject: [PATCH 10/38] Reverting support for cupy arrays for weights; add tests for weights. --- python/cudf/cudf/core/indexed_frame.py | 16 ++++++------- python/cudf/cudf/tests/conftest.py | 13 +++++++++++ python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++- python/cudf/cudf/tests/test_series.py | 29 +++++++++++++++++++++--- 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 510424fc0ed..d8c203649f9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1731,7 +1731,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. replace == True is not yet supported for axis = 1/"columns" - weights : numpy or cupy ndarray-like, optional + weights : numpy ndarray-like, optional Default `None` for uniform probability distribution over rows to sample from. If `ndarray` is passed, the length of `weights` should equal to the number of rows to sample from, and will be normalized @@ -1815,6 +1815,13 @@ def sample( "when 'replace=False'" ) + # Construct random state if `random_state` parameter is a seed. + if not isinstance( + random_state, (np.random.RandomState, cp.random.RandomState) + ): + lib = cp if axis == 0 else np + random_state = lib.random.RandomState(seed=random_state) + # Normalize `weights` array. if weights is not None: if is_column_like(weights): @@ -1831,13 +1838,6 @@ def sample( weights = weights / weights.sum() - # Construct random state if `random_state` parameter is a seed. - if not isinstance( - random_state, (np.random.RandomState, cp.random.RandomState) - ): - lib = cp if axis == 0 else np - random_state = lib.random.RandomState(seed=random_state) - if axis == 0: return self._sample_axis_0( n, weights, replace, random_state, ignore_index diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index eebe3c15145..2595acac190 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -47,3 +47,16 @@ def exact_checker(expected, got): return np.random.RandomState(42), seed_or_state_ctor(42), shape_checker else: pytest.skip("Unsupported params.") + + +@pytest.fixture(params=[None, np.ones]) +def make_weights(request): + if request.param is None: + return lambda _: (None, None) + else: + + def wrapped(size): + # Uniform distribution, non-normalized + return request.param(size), request.param(size) + + return wrapped diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6031e856a88..f2302381bd5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7405,7 +7405,9 @@ def test_cudf_arrow_array_error(): @pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): +def test_dataframe_sample_basic( + n, frac, replace, axis, random_state_tuple, make_weights +): # TODO: decouple valid parameter set from invalid parameter set and # write two separate tests to reduce function complexity. if axis == 1 and replace: @@ -7427,12 +7429,24 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): ) df = cudf.DataFrame.from_pandas(pdf) + pd_weights, gd_weights = make_weights(len(pdf)) + if ( + not replace + and not isinstance(gd_weights, np.random.RandomState) + and gd_weights is not None + ): + pytest.skip( + "`cupy.random.RandomState` doesn't support weighted sampling " + "without replacement." + ) + try: expected = pdf.sample( n=n, frac=frac, replace=replace, random_state=pd_random_state, + weights=pd_weights, axis=axis, ) except BaseException: @@ -7446,6 +7460,7 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): "frac": frac, "replace": replace, "random_state": pd_random_state, + "weights": pd_weights, "axis": axis, }, ), @@ -7456,6 +7471,7 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): "frac": frac, "replace": replace, "random_state": gd_random_state, + "weights": gd_weights, "axis": axis, }, ), @@ -7466,6 +7482,7 @@ def test_dataframe_sample_basic(n, frac, replace, axis, random_state_tuple): frac=frac, replace=replace, random_state=gd_random_state, + weights=gd_weights, axis=axis, ) checker(expected, got) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3461dc73f30..dc46cc37dfc 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1595,14 +1595,31 @@ def test_fill_new_category(): @pytest.mark.parametrize("n", [0, 2, 10, None]) @pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) -def test_series_sample_basic(n, frac, replace, random_state_tuple): +def test_series_sample_basic( + n, frac, replace, random_state_tuple, make_weights +): pd_random_state, gd_random_state, checker = random_state_tuple psr = pd.Series([1, 2, 3, 4, 5]) sr = cudf.Series.from_pandas(psr) + pd_weights, gd_weights = make_weights(len(psr)) + if ( + not replace + and not isinstance(gd_weights, np.random.RandomState) + and gd_weights is not None + ): + pytest.skip( + "`cupy.random.RandomState` doesn't support weighted sampling " + "without replacement." + ) + try: expected = psr.sample( - n=n, frac=frac, replace=replace, random_state=pd_random_state, + n=n, + frac=frac, + replace=replace, + weights=pd_weights, + random_state=pd_random_state, ) except BaseException: assert_exceptions_equal( @@ -1614,6 +1631,7 @@ def test_series_sample_basic(n, frac, replace, random_state_tuple): "n": n, "frac": frac, "replace": replace, + "weights": pd_weights, "random_state": pd_random_state, }, ), @@ -1623,12 +1641,17 @@ def test_series_sample_basic(n, frac, replace, random_state_tuple): "n": n, "frac": frac, "replace": replace, + "weights": gd_weights, "random_state": gd_random_state, }, ), ) else: got = sr.sample( - n=n, frac=frac, replace=replace, random_state=gd_random_state, + n=n, + frac=frac, + replace=replace, + weights=gd_weights, + random_state=gd_random_state, ) checker(expected, got) From ce2e0fa7b46e4ec04e1709a36d9fc224153978fa Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 15:49:12 -0800 Subject: [PATCH 11/38] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/indexed_frame.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index acd059d7451..8ff0ae33f26 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6101,8 +6101,8 @@ def _sample_axis_1( "Cannot take a sample larger than 0 when axis is empty" ) - columns = np.asarray(self._data.names) - if not replace and n > columns.size: + columns = self._data.names + if not replace and n > len(columns): raise ValueError( "Cannot take a larger sample " "than population when 'replace=False'" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d8c203649f9..a43945f7759 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1709,17 +1709,19 @@ def sample( ): """Return a random sample of items from an axis of object. - You can use random_state for reproducibility. + If reproducible results are required, a random number generator may be provided + via the `random_state` parameter. This function will always produce the same sample + given an identical `random_state`. Notes ----- When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). If a numpy random state is used, - the rows sampled is guarenteed to match pandas result, but performance - may be decreased if the item count is too high. Optionally, user may - specify a cupy random state to achieve better performance at high - item counts. + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. If exact pandas + equivalence is not required, using a cupy random state will achieve better + performance, especially at high item counts. Parameters ---------- @@ -1735,11 +1737,11 @@ def sample( Default `None` for uniform probability distribution over rows to sample from. If `ndarray` is passed, the length of `weights` should equal to the number of rows to sample from, and will be normalized - to have sum of 1. Unlike pandas, index alignment is not currently + to have a sum of 1. Unlike pandas, index alignment is not currently not performed. random_state : int, numpy/cupy RandomState, or None, default None If None, default cupy random state is chosen. - If int, as seed for the default cupy random state. + If int, the seed for the default cupy random state. If RandomState, rows-to-sample are generated from the RandomState. axis : {0 or `index`, 1 or `columns`, None}, default None Axis to sample. Accepts axis number or name. @@ -1780,7 +1782,8 @@ def sample( dtype: int64 >>> df = cudf.DataFrame( - ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]}) + ... {"a": [1, 2], "b": [2, 3], "c": [3, 4], "d": [4, 5]} + ... ) >>> df.sample(2, axis=1) a c 0 1 3 @@ -1836,7 +1839,7 @@ def sample( "Weights and axis to be sampled must be of same length" ) - weights = weights / weights.sum() + weights /= weights.sum() if axis == 0: return self._sample_axis_0( From b0b6ce7c1f6dc8653898eca50f9389d61cfadeda Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 15:56:11 -0800 Subject: [PATCH 12/38] Apply fixes from reviews --- python/cudf/cudf/core/dataframe.py | 5 ----- python/cudf/cudf/core/indexed_frame.py | 15 +++++++-------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8ff0ae33f26..5bd9f8aff8c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6096,11 +6096,6 @@ def _sample_axis_1( "when 'replace=True'" ) - if n > 0 and self.shape[1] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - columns = self._data.names if not replace and n > len(columns): raise ValueError( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a43945f7759..c64e2f25296 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1709,9 +1709,9 @@ def sample( ): """Return a random sample of items from an axis of object. - If reproducible results are required, a random number generator may be provided - via the `random_state` parameter. This function will always produce the same sample - given an identical `random_state`. + If reproducible results are required, a random number generator may be + provided via the `random_state` parameter. This function will always + produce the same sample given an identical `random_state`. Notes ----- @@ -1719,9 +1719,9 @@ def sample( a numpy random state (``numpy.random.RandomState``) or a cupy random state (``cupy.random.RandomState``). When a numpy random state is used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact pandas - equivalence is not required, using a cupy random state will achieve better - performance, especially at high item counts. + pandas method call, but generating the sample may be slow. If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially at high item counts. Parameters ---------- @@ -1746,8 +1746,7 @@ def sample( axis : {0 or `index`, 1 or `columns`, None}, default None Axis to sample. Accepts axis number or name. Default is stat axis for given data type - (0 for Series and DataFrames). Series and Index doesn't - support axis=1. + (0 for Series and DataFrames). Series doesn't support axis=1. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. From 62ee2b404eeb0bf5a77792baef0fa600ed5a1491 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 16:04:48 -0800 Subject: [PATCH 13/38] Conforming error messages. --- python/cudf/cudf/core/dataframe.py | 8 ++++---- python/cudf/cudf/core/indexed_frame.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5bd9f8aff8c..dcb88361db1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6092,15 +6092,15 @@ def _sample_axis_1( ): if replace: raise NotImplementedError( - "Sample is not supported for axis 1/'columns' " - "when 'replace=True'" + "Sample is not supported for axis 1/`columns` when" + "`replace=True`." ) columns = self._data.names if not replace and n > len(columns): raise ValueError( - "Cannot take a larger sample " - "than population when 'replace=False'" + "Cannot take a larger sample than population when" + "`replace=False`." ) sampled_column_labels = random_state.choice( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c64e2f25296..f1beb324773 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1797,24 +1797,24 @@ def sample( else: if frac > 1 and not replace: raise ValueError( - "Replace has to be set to `True` " - "when upsampling the population `frac` > 1." + "Replace has to be set to `True` when upsampling the " + "population `frac` > 1." ) if n is not None: raise ValueError( - "Please enter a value for `frac` OR `n`, not both" + "Please enter a value for `frac` OR `n`, not both." ) n = int(round(size * frac)) if n > 0 and size == 0: raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" + "Cannot take a sample larger than 0 when axis is empty." ) if not replace and n > size: raise ValueError( - "Cannot take a larger sample than population " - "when 'replace=False'" + "Cannot take a larger sample than population when " + "`replace=False`." ) # Construct random state if `random_state` parameter is a seed. @@ -1835,7 +1835,7 @@ def sample( if size != len(weights): raise ValueError( - "Weights and axis to be sampled must be of same length" + "Weights and axis to be sampled must be of same length." ) weights /= weights.sum() @@ -1848,7 +1848,7 @@ def sample( if isinstance(random_state, cp.random.RandomState): raise ValueError( "Use a numpy random state when sampling from " - "axis=1/columns." + "`axis=1`/`columns`." ) return self._sample_axis_1( n, weights, replace, random_state, ignore_index From 6c34a71250c0d7b334904f1745e3133dda30c78d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 16:08:47 -0800 Subject: [PATCH 14/38] Removing redundant tests --- python/cudf/cudf/tests/test_dataframe.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f2302381bd5..36f27472aaf 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7488,17 +7488,6 @@ def test_dataframe_sample_basic( checker(expected, got) -@pytest.mark.parametrize("replace", [True, False]) -def test_dataframe_reproducibility_seed(replace): - df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - - seed = 1 - expected = df.sample(1024, replace=replace, random_state=seed) - out = df.sample(1024, replace=replace, random_state=seed) - - assert_eq(expected, out) - - @pytest.mark.parametrize("replace", [True, False]) def test_dataframe_sample_reproducibility(replace, random_state_tuple): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) @@ -7506,7 +7495,7 @@ def test_dataframe_sample_reproducibility(replace, random_state_tuple): _, gd_random_state, _ = random_state_tuple if gd_random_state is None: pytest.skip( - "`None` seed defaults to global random state that's dependent to" + "`None` seed defaults to global random state that's dependent to " "system implementation. Reproducibility isn't guarenteed." ) seed = 10 From e0d78ee298ab866e010208ca16b04cd16e53f980 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 16:22:26 -0800 Subject: [PATCH 15/38] Rewrites weights fixture; skip comparing error messages. --- python/cudf/cudf/tests/conftest.py | 10 +++++++--- python/cudf/cudf/tests/test_dataframe.py | 15 ++++++++------- python/cudf/cudf/tests/test_series.py | 15 ++++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 2595acac190..13bb7f06cef 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -18,7 +18,9 @@ def datadir(): params=[None, 42, np.random.RandomState, cp.random.RandomState] ) def random_state_tuple(request): - """A pytest fixture of valid `random_state` parameter pairs for pandas + """ + Specific to `test_dataframe_sample*` and `test_series_sample*` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas and cudf. Valid parameter combinations, and what to check for each pair are listed below: @@ -51,12 +53,14 @@ def exact_checker(expected, got): @pytest.fixture(params=[None, np.ones]) def make_weights(request): + """Specific to `test_dataframe_sample*` and `test_series_sample*` tests. + """ if request.param is None: - return lambda _: (None, None) + return lambda _: None else: def wrapped(size): # Uniform distribution, non-normalized - return request.param(size), request.param(size) + return request.param(size) return wrapped diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 36f27472aaf..d9e0de61739 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7429,11 +7429,11 @@ def test_dataframe_sample_basic( ) df = cudf.DataFrame.from_pandas(pdf) - pd_weights, gd_weights = make_weights(len(pdf)) + weights = make_weights(len(pdf)) if ( not replace - and not isinstance(gd_weights, np.random.RandomState) - and gd_weights is not None + and not isinstance(weights, np.random.RandomState) + and weights is not None ): pytest.skip( "`cupy.random.RandomState` doesn't support weighted sampling " @@ -7446,7 +7446,7 @@ def test_dataframe_sample_basic( frac=frac, replace=replace, random_state=pd_random_state, - weights=pd_weights, + weights=weights, axis=axis, ) except BaseException: @@ -7460,7 +7460,7 @@ def test_dataframe_sample_basic( "frac": frac, "replace": replace, "random_state": pd_random_state, - "weights": pd_weights, + "weights": weights, "axis": axis, }, ), @@ -7471,10 +7471,11 @@ def test_dataframe_sample_basic( "frac": frac, "replace": replace, "random_state": gd_random_state, - "weights": gd_weights, + "weights": weights, "axis": axis, }, ), + compare_error_message=False, ) else: got = df.sample( @@ -7482,7 +7483,7 @@ def test_dataframe_sample_basic( frac=frac, replace=replace, random_state=gd_random_state, - weights=gd_weights, + weights=weights, axis=axis, ) checker(expected, got) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index dc46cc37dfc..929efc42308 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1602,11 +1602,11 @@ def test_series_sample_basic( psr = pd.Series([1, 2, 3, 4, 5]) sr = cudf.Series.from_pandas(psr) - pd_weights, gd_weights = make_weights(len(psr)) + weights = make_weights(len(psr)) if ( not replace - and not isinstance(gd_weights, np.random.RandomState) - and gd_weights is not None + and not isinstance(weights, np.random.RandomState) + and weights is not None ): pytest.skip( "`cupy.random.RandomState` doesn't support weighted sampling " @@ -1618,7 +1618,7 @@ def test_series_sample_basic( n=n, frac=frac, replace=replace, - weights=pd_weights, + weights=weights, random_state=pd_random_state, ) except BaseException: @@ -1631,7 +1631,7 @@ def test_series_sample_basic( "n": n, "frac": frac, "replace": replace, - "weights": pd_weights, + "weights": weights, "random_state": pd_random_state, }, ), @@ -1641,17 +1641,18 @@ def test_series_sample_basic( "n": n, "frac": frac, "replace": replace, - "weights": gd_weights, + "weights": weights, "random_state": gd_random_state, }, ), + compare_error_message=False, ) else: got = sr.sample( n=n, frac=frac, replace=replace, - weights=gd_weights, + weights=weights, random_state=gd_random_state, ) checker(expected, got) From 5300d02a7a68e3a74bacef0a3d71441af101fd0d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 15 Feb 2022 16:55:45 -0800 Subject: [PATCH 16/38] Update copyright headers --- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/_lib/cpp/copying.pxd | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 718f92cea75..d4476cd8371 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pickle diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 4ac0dc09160..a1c433774b5 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6c1a705ffcb..3588cf0fc61 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """ Test related to Index diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 929efc42308..d2c674df89c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import operator import re From f3b05ec444e227dbad40954f77118be7f4d24b34 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 23 Feb 2022 21:00:54 -0800 Subject: [PATCH 17/38] Commiting changes discussed offline --- python/cudf/cudf/core/indexed_frame.py | 11 +++++------ python/cudf/cudf/tests/conftest.py | 16 ++++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 14 +++++++------- python/cudf/cudf/tests/test_series.py | 14 +++++++------- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9cb2c8c6ead..47cc8179b85 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,6 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.dtypes import is_column_like doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -1723,7 +1722,9 @@ def sample( used, the output is guaranteed to match the output of the corresponding pandas method call, but generating the sample may be slow. If exact pandas equivalence is not required, using a cupy random state will - achieve better performance, especially at high item counts. + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. Parameters ---------- @@ -1735,7 +1736,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. replace == True is not yet supported for axis = 1/"columns" - weights : numpy ndarray-like, optional + weights : ndarray-like, optional Default `None` for uniform probability distribution over rows to sample from. If `ndarray` is passed, the length of `weights` should equal to the number of rows to sample from, and will be normalized @@ -1828,9 +1829,7 @@ def sample( # Normalize `weights` array. if weights is not None: - if is_column_like(weights): - weights = np.asarray(weights) - else: + if isinstance(weights, str): raise NotImplementedError( "Weights specified by string is unsupported yet." ) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 13bb7f06cef..24859571b36 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import pathlib import cupy as cp @@ -51,16 +53,22 @@ def exact_checker(expected, got): pytest.skip("Unsupported params.") -@pytest.fixture(params=[None, np.ones]) -def make_weights(request): +@pytest.fixture(params=[None, "ones"]) +def make_weights(request, random_state_tuple): """Specific to `test_dataframe_sample*` and `test_series_sample*` tests. + Only testing weights array that matches type with random state. """ + _, gd_random_state, _ = random_state_tuple + if request.param is None: - return lambda _: None + return lambda _: (None, None) else: def wrapped(size): # Uniform distribution, non-normalized - return request.param(size) + if isinstance(gd_random_state, np.random.RandomState): + return np.ones(size), np.ones(size) + else: + return np.ones(size), cp.ones(size) return wrapped diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7092b63566b..6dd8cd9b012 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7427,11 +7427,11 @@ def test_dataframe_sample_basic( ) df = cudf.DataFrame.from_pandas(pdf) - weights = make_weights(len(pdf)) + pd_weights, gd_weights = make_weights(len(pdf)) if ( not replace - and not isinstance(weights, np.random.RandomState) - and weights is not None + and not isinstance(gd_random_state, np.random.RandomState) + and gd_weights is not None ): pytest.skip( "`cupy.random.RandomState` doesn't support weighted sampling " @@ -7444,7 +7444,7 @@ def test_dataframe_sample_basic( frac=frac, replace=replace, random_state=pd_random_state, - weights=weights, + weights=pd_weights, axis=axis, ) except BaseException: @@ -7458,7 +7458,7 @@ def test_dataframe_sample_basic( "frac": frac, "replace": replace, "random_state": pd_random_state, - "weights": weights, + "weights": pd_weights, "axis": axis, }, ), @@ -7469,7 +7469,7 @@ def test_dataframe_sample_basic( "frac": frac, "replace": replace, "random_state": gd_random_state, - "weights": weights, + "weights": gd_weights, "axis": axis, }, ), @@ -7481,7 +7481,7 @@ def test_dataframe_sample_basic( frac=frac, replace=replace, random_state=gd_random_state, - weights=weights, + weights=gd_weights, axis=axis, ) checker(expected, got) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d2c674df89c..bc1d77ab1d5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1602,11 +1602,11 @@ def test_series_sample_basic( psr = pd.Series([1, 2, 3, 4, 5]) sr = cudf.Series.from_pandas(psr) - weights = make_weights(len(psr)) + pd_weights, gd_weights = make_weights(len(psr)) if ( not replace - and not isinstance(weights, np.random.RandomState) - and weights is not None + and not isinstance(gd_random_state, np.random.RandomState) + and gd_weights is not None ): pytest.skip( "`cupy.random.RandomState` doesn't support weighted sampling " @@ -1618,7 +1618,7 @@ def test_series_sample_basic( n=n, frac=frac, replace=replace, - weights=weights, + weights=pd_weights, random_state=pd_random_state, ) except BaseException: @@ -1631,7 +1631,7 @@ def test_series_sample_basic( "n": n, "frac": frac, "replace": replace, - "weights": weights, + "weights": pd_weights, "random_state": pd_random_state, }, ), @@ -1641,7 +1641,7 @@ def test_series_sample_basic( "n": n, "frac": frac, "replace": replace, - "weights": weights, + "weights": gd_weights, "random_state": gd_random_state, }, ), @@ -1652,7 +1652,7 @@ def test_series_sample_basic( n=n, frac=frac, replace=replace, - weights=weights, + weights=gd_weights, random_state=gd_random_state, ) checker(expected, got) From 7c07fbd885d48030835b6d7634e1ca57a383bc83 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 10:43:22 -0800 Subject: [PATCH 18/38] Skip comparing index.sample message --- python/cudf/cudf/tests/test_index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d2c384f534a..fc2d50d0025 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1648,6 +1648,7 @@ def test_index_sample_basic(n, frac, replace): "random_state": random_state, }, ), + compare_error_message=False ) else: gout = gindex.sample( From 4e1f4f3b5e6cce6ad673e049bd4ba2de26bf22a7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 11:26:52 -0800 Subject: [PATCH 19/38] Fix index style --- python/cudf/cudf/tests/test_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index fc2d50d0025..ffeb40d41f1 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1648,7 +1648,7 @@ def test_index_sample_basic(n, frac, replace): "random_state": random_state, }, ), - compare_error_message=False + compare_error_message=False, ) else: gout = gindex.sample( From ddc80aa2a561059c556ea60e168259ffcbac35b8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 13:44:46 -0800 Subject: [PATCH 20/38] Revert copyright change in b08a9b53400ad9801776ddadb986fbed9171ba5b --- ci/cpu/upload.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 551cfd588b8..f2f67e9e000 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -1,5 +1,4 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh From c57f4704b3a202adccc8f105ec5ab28653d6e2eb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:14:47 -0800 Subject: [PATCH 21/38] Update python/cudf/cudf/core/indexed_frame.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/indexed_frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e91c7c4e091..1231fe45dfb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1860,7 +1860,6 @@ def _sample_axis_0( random_state: Union[np.random.RandomState, cp.random.RandomState], ignore_index: bool, ): - # Dynamic dispatch to numpy/cupy depending on state provided. gather_map = cudf.core.column.as_column( random_state.choice(len(self), size=n, replace=replace, p=weights) ) From f1586103bb08fbe042c95c230298d51cb293c85e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:23:31 -0800 Subject: [PATCH 22/38] Remove error mimic --- python/cudf/cudf/core/dataframe.py | 5 ----- python/cudf/cudf/core/indexed_frame.py | 6 ------ 2 files changed, 11 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e34fe9b1f98..dca1313ffe8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6343,11 +6343,6 @@ def _sample_axis_1( ) columns = self._data.names - if not replace and n > len(columns): - raise ValueError( - "Cannot take a larger sample than population when" - "`replace=False`." - ) sampled_column_labels = random_state.choice( columns, size=n, replace=False, p=weights diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e91c7c4e091..4d93ddd965e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1811,12 +1811,6 @@ def sample( "Cannot take a sample larger than 0 when axis is empty." ) - if not replace and n > size: - raise ValueError( - "Cannot take a larger sample than population when " - "`replace=False`." - ) - # Construct random state if `random_state` parameter is a seed. if not isinstance( random_state, (np.random.RandomState, cp.random.RandomState) From 4f0d2ea328539542019b5b24019e90edbc97d8f1 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:23:44 -0800 Subject: [PATCH 23/38] Pre-commits --- python/cudf/cudf/core/indexed_frame.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4d93ddd965e..b612d958020 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1726,7 +1726,7 @@ def sample( Parameters ---------- n : int, optional - Number of items from axis to return. Cannot be used with frac. + Number of items from axis to return. Cannot be used with `frac`. Default = 1 if frac = None. frac : float, optional Fraction of axis items to return. Cannot be used with n. @@ -1815,6 +1815,11 @@ def sample( if not isinstance( random_state, (np.random.RandomState, cp.random.RandomState) ): + # By default, cupy random state is used to sample from rows and + # numpy is used to sample from columns. In general, cuDF assumes + # the number of columns is much smaller than the number of rows, + # thus using numpy random states can avoid kernel launching + # overhead on creating a small gather map. lib = cp if axis == 0 else np random_state = lib.random.RandomState(seed=random_state) From 2a9c013f9012c539d8e35dbadcdfc4daf5b28645 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:27:35 -0800 Subject: [PATCH 24/38] update axis=1 cupy random state error message --- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b612d958020..01b47c924fb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1844,8 +1844,8 @@ def sample( else: if isinstance(random_state, cp.random.RandomState): raise ValueError( - "Use a numpy random state when sampling from " - "`axis=1`/`columns`." + "Sampling from `axis=1`/`columns` with cupy random state" + "isn't supported." ) return self._sample_axis_1( n, weights, replace, random_state, ignore_index From 84d2da3afec338dc13e04a5b2edd0d579cdeb411 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:51:19 -0800 Subject: [PATCH 25/38] update reprocibility test --- python/cudf/cudf/tests/test_dataframe.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c9650893620..a99efd51937 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7233,22 +7233,16 @@ def test_dataframe_sample_basic( @pytest.mark.parametrize("replace", [True, False]) -def test_dataframe_sample_reproducibility(replace, random_state_tuple): +@pytest.mark.parametrize( + "random_state_lib", [cp.random.RandomState, np.random.RandomState] +) +def test_dataframe_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - _, gd_random_state, _ = random_state_tuple - if gd_random_state is None: - pytest.skip( - "`None` seed defaults to global random state that's dependent to " - "system implementation. Reproducibility isn't guarenteed." - ) - seed = 10 expected = df.sample( - 1024, replace=replace, random_state=type(gd_random_state)(seed) - ) - out = df.sample( - 1024, replace=replace, random_state=type(gd_random_state)(seed) + 1024, replace=replace, random_state=random_state_lib(10) ) + out = df.sample(1024, replace=replace, random_state=random_state_lib(10)) assert_eq(expected, out) From 3f797cf34b4c8c31684ce24226153d2ebb47e727 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 14:52:26 -0800 Subject: [PATCH 26/38] Revert test_struct copyright change. --- python/cudf/cudf/tests/test_struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 8d1056ca9cc..167f171fa26 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. import numpy as np import pandas as pd From 967a77efeb5014e1bbf2d7a01270ac52d826ed61 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 15:36:17 -0800 Subject: [PATCH 27/38] Test built-in iterable with argument --- python/cudf/cudf/core/indexed_frame.py | 11 +++++++---- python/cudf/cudf/tests/conftest.py | 4 +++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a6ce1f8a9df..f3fcc8580cf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1812,9 +1812,11 @@ def sample( ) # Construct random state if `random_state` parameter is a seed. - if not isinstance( - random_state, (np.random.RandomState, cp.random.RandomState) - ): + if isinstance(random_state, cp.random.RandomState): + lib = cp + elif isinstance(random_state, np.random.RandomState): + lib = np + else: # By default, cupy random state is used to sample from rows and # numpy is used to sample from columns. In general, cuDF assumes # the number of columns is much smaller than the number of rows, @@ -1835,7 +1837,8 @@ def sample( "Weights and axis to be sampled must be of same length." ) - weights /= weights.sum() + weights = lib.asarray(weights) + weights = weights / weights.sum() if axis == 0: return self._sample_axis_0( diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 132e5d37ea6..a6ace94606a 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -56,7 +56,7 @@ def exact_checker(expected, got): pytest.skip("Unsupported params.") -@pytest.fixture(params=[None, "ones"]) +@pytest.fixture(params=[None, "builtin-list", "nd-arrays"]) def make_weights(request, random_state_tuple): """Specific to `test_dataframe_sample*` and `test_series_sample*` tests. Only testing weights array that matches type with random state. @@ -65,6 +65,8 @@ def make_weights(request, random_state_tuple): if request.param is None: return lambda _: (None, None) + elif request.param == "builtin-list": + return lambda size: ([1] * size, [1] * size) else: def wrapped(size): From ebf6f70e62b618d3c2c4f2d25321973dbfc8d3d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 15:43:53 -0800 Subject: [PATCH 28/38] doc fix --- python/cudf/cudf/core/indexed_frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index f3fcc8580cf..8df05dc773f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1811,16 +1811,16 @@ def sample( "Cannot take a sample larger than 0 when axis is empty." ) - # Construct random state if `random_state` parameter is a seed. if isinstance(random_state, cp.random.RandomState): lib = cp elif isinstance(random_state, np.random.RandomState): lib = np else: - # By default, cupy random state is used to sample from rows and - # numpy is used to sample from columns. In general, cuDF assumes - # the number of columns is much smaller than the number of rows, - # thus using numpy random states can avoid kernel launching + # Construct random state if `random_state` parameter is None or a + # seed. By default, cupy random state is used to sample from rows + # and numpy is used to sample from columns. In general, cuDF + # assumes the number of columns is much smaller than the number of + # rows, thus using numpy random states can avoid kernel launching # overhead on creating a small gather map. lib = cp if axis == 0 else np random_state = lib.random.RandomState(seed=random_state) From 20a5f9cb0102e4183ba26a4ab16d7cd5e756b3ad Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 17:42:12 -0800 Subject: [PATCH 29/38] Simplifying `axis==1` case --- python/cudf/cudf/tests/conftest.py | 73 +++++++++++++++----- python/cudf/cudf/tests/test_dataframe.py | 88 ++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index a6ace94606a..099d414c0d1 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,5 +1,6 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. +import itertools import os import pathlib @@ -20,30 +21,26 @@ def datadir(): @pytest.fixture( - params=[None, 42, np.random.RandomState, cp.random.RandomState] + params=itertools.product([0, 2, None], [0.3, None]), + ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", ) -def random_state_tuple(request): +def sample_n_frac(request): """ - Specific to `test_dataframe_sample*` and `test_series_sample*` tests. - A pytest fixture of valid `random_state` parameter pairs for pandas - and cudf. Valid parameter combinations, and what to check for each pair - are listed below: - - pandas: None, seed(int), np.random.RandomState, np.random.RandomState - cudf: None, seed(int), np.random.RandomState, cp.random.RandomState - ------ - check: shape, shape, exact result, shape - - Each column above stands for one valid parameter combination and check. + Specific to `test_sample*` tests. """ + n, frac = request.param + if n is not None and frac is not None: + pytest.skip("Cannot specify both n and frac.") + return n, frac + +def _random_state_tuple(seed_or_state_ctor): def shape_checker(expected, got): assert expected.shape == got.shape def exact_checker(expected, got): assert_eq(expected, got) - seed_or_state_ctor = request.param if seed_or_state_ctor is None: return None, None, shape_checker elif isinstance(seed_or_state_ctor, int): @@ -56,9 +53,53 @@ def exact_checker(expected, got): pytest.skip("Unsupported params.") +@pytest.fixture( + params=[None, 42, np.random.RandomState], + ids=["None", "IntSeed", "NumpyRandomState"], +) +def random_state_tuple_axis_1(request): + """ + Specific to `test_sample*_axis_1` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState + cudf: None, seed(int), np.random.RandomState + ------ + check: shape, shape, exact result + + Each column above stands for one valid parameter combination and check. + """ + + return _random_state_tuple(request.param) + + +@pytest.fixture( + params=[None, 42, np.random.RandomState, cp.random.RandomState], + ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], +) +def random_state_tuple_axis_0(request): + """ + Specific to `test_sample*_axis_0` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState, np.random.RandomState + cudf: None, seed(int), np.random.RandomState, cp.random.RandomState + ------ + check: shape, shape, exact result, shape + + Each column above stands for one valid parameter combination and check. + """ + + return _random_state_tuple(request.param) + + @pytest.fixture(params=[None, "builtin-list", "nd-arrays"]) -def make_weights(request, random_state_tuple): - """Specific to `test_dataframe_sample*` and `test_series_sample*` tests. +def make_weights_axis_0(request, random_state_tuple): + """Specific to `test_sample*_axis_0` tests. Only testing weights array that matches type with random state. """ _, gd_random_state, _ = random_state_tuple diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a99efd51937..eca21b5b8af 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7144,13 +7144,51 @@ def test_cudf_arrow_array_error(): sr.__arrow_array__() +@pytest.mark.parametrize( + "make_weights_axis_1", + [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], +) +def test_sample_axis_1( + sample_n_frac, random_state_tuple_axis_1, make_weights_axis_1 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_1 + + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + weights = make_weights_axis_1(len(pdf.columns)) + + expected = pdf.sample( + n=n, + frac=frac, + replace=False, + random_state=pd_random_state, + weights=weights, + axis=1, + ) + got = df.sample( + n=n, + frac=frac, + replace=False, + random_state=gd_random_state, + weights=weights, + axis=1, + ) + checker(expected, got) + + @pytest.mark.parametrize("n", [0, 2, 10, None]) @pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dataframe_sample_basic( - n, frac, replace, axis, random_state_tuple, make_weights -): +def test_sample_axis_0(n, frac, replace, random_state_tuple, make_weights): + axis = 0 # TODO: decouple valid parameter set from invalid parameter set and # write two separate tests to reduce function complexity. if axis == 1 and replace: @@ -7236,7 +7274,7 @@ def test_dataframe_sample_basic( @pytest.mark.parametrize( "random_state_lib", [cp.random.RandomState, np.random.RandomState] ) -def test_dataframe_sample_reproducibility(replace, random_state_lib): +def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) expected = df.sample( @@ -7247,6 +7285,46 @@ def test_dataframe_sample_reproducibility(replace, random_state_lib): assert_eq(expected, out) +@pytest.mark.parametrize("axis", [0, 1]) +def test_sample_invalid_n_frac_combo(axis): + n, frac = 2, 0.5 + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + ) + + +@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_oversample_without_replace(n, frac, axis): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + rfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + ) + + @pytest.mark.parametrize( "df", [ From 17319b1486f89bf6d06cd288f44f2b6219c273fd Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 20:58:04 -0800 Subject: [PATCH 30/38] Simplify `axis=0` case, further document unsupported argument combinations and raise proper warning message. --- python/cudf/cudf/core/dataframe.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 20 +++-- python/cudf/cudf/tests/conftest.py | 10 +-- python/cudf/cudf/tests/test_dataframe.py | 96 ++++++++---------------- 4 files changed, 56 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dca1313ffe8..9fb7f4661ea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6337,15 +6337,15 @@ def _sample_axis_1( ignore_index: bool, ): if replace: + # Since cuDF does not support multiple columns with same name, + # sample with replace=True at axis 1 is unsupported. raise NotImplementedError( "Sample is not supported for axis 1/`columns` when" "`replace=True`." ) - columns = self._data.names - sampled_column_labels = random_state.choice( - columns, size=n, replace=False, p=weights + self._column_names, size=n, replace=False, p=weights ) result = self._get_columns_by_label(sampled_column_labels) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8df05dc773f..262857f9bbc 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1732,7 +1732,10 @@ def sample( Fraction of axis items to return. Cannot be used with n. replace : bool, default False Allow or disallow sampling of the same row more than once. - replace == True is not yet supported for axis = 1/"columns" + `replace == True` is not supported for axis = 1/"columns". + `replace == False` is not supported for axis = 0/"index" given + `random_state` is `None` or a cupy random state, and `weights` is + specified. weights : ndarray-like, optional Default `None` for uniform probability distribution over rows to sample from. If `ndarray` is passed, the length of `weights` should @@ -1862,12 +1865,19 @@ def _sample_axis_0( random_state: Union[np.random.RandomState, cp.random.RandomState], ignore_index: bool, ): - gather_map = cudf.core.column.as_column( - random_state.choice(len(self), size=n, replace=replace, p=weights) - ) + try: + gather_map_array = random_state.choice( + len(self), size=n, replace=replace, p=weights + ) + except NotImplementedError as e: + raise NotImplementedError( + "Unsupported arguments for gather map generation." + ) from e return self._gather( - gather_map, keep_index=not ignore_index, check_bounds=False + cudf.core.column.as_column(gather_map_array), + keep_index=not ignore_index, + check_bounds=False, ) def _sample_axis_1( diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 099d414c0d1..14b0ad96e79 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -49,8 +49,8 @@ def exact_checker(expected, got): return seed_or_state_ctor(42), seed_or_state_ctor(42), exact_checker elif seed_or_state_ctor == cp.random.RandomState: return np.random.RandomState(42), seed_or_state_ctor(42), shape_checker - else: - pytest.skip("Unsupported params.") + + assert False, "No random state is specialized beyond the above types." @pytest.fixture( @@ -97,12 +97,12 @@ def random_state_tuple_axis_0(request): return _random_state_tuple(request.param) -@pytest.fixture(params=[None, "builtin-list", "nd-arrays"]) -def make_weights_axis_0(request, random_state_tuple): +@pytest.fixture(params=[None, "builtin_list", "ndarray"]) +def make_weights_axis_0(request, random_state_tuple_axis_0): """Specific to `test_sample*_axis_0` tests. Only testing weights array that matches type with random state. """ - _, gd_random_state, _ = random_state_tuple + _, gd_random_state, _ = random_state_tuple_axis_0 if request.param is None: return lambda _: (None, None) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index eca21b5b8af..03f67587041 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7184,21 +7184,12 @@ def test_sample_axis_1( checker(expected, got) -@pytest.mark.parametrize("n", [0, 2, 10, None]) -@pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) -def test_sample_axis_0(n, frac, replace, random_state_tuple, make_weights): - axis = 0 - # TODO: decouple valid parameter set from invalid parameter set and - # write two separate tests to reduce function complexity. - if axis == 1 and replace: - pytest.skip(reason="we currently don't support column with same name.") - - pd_random_state, gd_random_state, checker = random_state_tuple - if axis == 1 and isinstance(gd_random_state, cp.random.RandomState): - pytest.skip( - reason="cannot sample column axis with device random state." - ) +def test_sample_axis_0( + sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 pdf = pd.DataFrame( { @@ -7206,11 +7197,10 @@ def test_sample_axis_0(n, frac, replace, random_state_tuple, make_weights): "float": [0.05, 0.2, 0.3, 0.2, 0.25], "int": [1, 3, 5, 4, 2], }, - index=[1, 2, 3, 4, 5], ) df = cudf.DataFrame.from_pandas(pdf) - pd_weights, gd_weights = make_weights(len(pdf)) + pd_weights, gd_weights = make_weights_axis_0(len(pdf)) if ( not replace and not isinstance(gd_random_state, np.random.RandomState) @@ -7221,53 +7211,24 @@ def test_sample_axis_0(n, frac, replace, random_state_tuple, make_weights): "without replacement." ) - try: - expected = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=pd_random_state, - weights=pd_weights, - axis=axis, - ) - except BaseException: - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": pd_random_state, - "weights": pd_weights, - "axis": axis, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": gd_random_state, - "weights": gd_weights, - "axis": axis, - }, - ), - compare_error_message=False, - ) - else: - got = df.sample( - n=n, - frac=frac, - replace=replace, - random_state=gd_random_state, - weights=gd_weights, - axis=axis, - ) - checker(expected, got) + expected = pdf.sample( + n=n, + frac=frac, + replace=replace, + random_state=pd_random_state, + weights=pd_weights, + axis=0, + ) + + got = df.sample( + n=n, + frac=frac, + replace=replace, + random_state=gd_random_state, + weights=gd_weights, + axis=0, + ) + checker(expected, got) @pytest.mark.parametrize("replace", [True, False]) @@ -7325,6 +7286,15 @@ def test_oversample_without_replace(n, frac, axis): ) +@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +def test_sample_unsupported_arguments(random_state): + df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) + with pytest.raises(NotImplementedError, match="Unsupported argument"): + df.sample( + n=2, replace=False, random_state=random_state, weights=[1] * 5 + ) + + @pytest.mark.parametrize( "df", [ From ac77a233a37ccb39461a82fb343b1db4f500ae25 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 25 Feb 2022 21:13:31 -0800 Subject: [PATCH 31/38] Simplify series test. --- python/cudf/cudf/tests/test_series.py | 70 ++++++++------------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d12fe1c733e..b551d5b89cd 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1590,17 +1590,16 @@ def test_fill_new_category(): gs[0:1] = "d" -@pytest.mark.parametrize("n", [0, 2, 10, None]) -@pytest.mark.parametrize("frac", [0.3, 2, None]) @pytest.mark.parametrize("replace", [True, False]) -def test_series_sample_basic( - n, frac, replace, random_state_tuple, make_weights +def test_sample( + sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 ): - pd_random_state, gd_random_state, checker = random_state_tuple + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 psr = pd.Series([1, 2, 3, 4, 5]) sr = cudf.Series.from_pandas(psr) - pd_weights, gd_weights = make_weights(len(psr)) + pd_weights, gd_weights = make_weights_axis_0(len(psr)) if ( not replace and not isinstance(gd_random_state, np.random.RandomState) @@ -1611,49 +1610,22 @@ def test_series_sample_basic( "without replacement." ) - try: - expected = psr.sample( - n=n, - frac=frac, - replace=replace, - weights=pd_weights, - random_state=pd_random_state, - ) - except BaseException: - assert_exceptions_equal( - lfunc=psr.sample, - rfunc=sr.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "weights": pd_weights, - "random_state": pd_random_state, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "weights": gd_weights, - "random_state": gd_random_state, - }, - ), - compare_error_message=False, - ) - else: - got = sr.sample( - n=n, - frac=frac, - replace=replace, - weights=gd_weights, - random_state=gd_random_state, - ) - checker(expected, got) + expected = psr.sample( + n=n, + frac=frac, + replace=replace, + weights=pd_weights, + random_state=pd_random_state, + ) + + got = sr.sample( + n=n, + frac=frac, + replace=replace, + weights=gd_weights, + random_state=gd_random_state, + ) + checker(expected, got) @pytest.mark.parametrize( From 12015f7256b1c06ed635562454e9a3cada96ba32 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 2 Mar 2022 23:46:37 -0800 Subject: [PATCH 32/38] Update comments --- python/cudf/cudf/core/indexed_frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 262857f9bbc..57c5b4040d4 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1820,11 +1820,9 @@ def sample( lib = np else: # Construct random state if `random_state` parameter is None or a - # seed. By default, cupy random state is used to sample from rows - # and numpy is used to sample from columns. In general, cuDF - # assumes the number of columns is much smaller than the number of - # rows, thus using numpy random states can avoid kernel launching - # overhead on creating a small gather map. + # seed. By default, cupy random state is used to sample rows + # and numpy is used to sample columns. This is because row data + # is stored on device, and the column objects are stored on host. lib = cp if axis == 0 else np random_state = lib.random.RandomState(seed=random_state) From 5511585cc16372930a03dd7c66fd5190bd8a6437 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 2 Mar 2022 23:46:56 -0800 Subject: [PATCH 33/38] Update error message --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 57c5b4040d4..1fca4451903 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1869,7 +1869,7 @@ def _sample_axis_0( ) except NotImplementedError as e: raise NotImplementedError( - "Unsupported arguments for gather map generation." + "Random sampling with cupy does not support these inputs." ) from e return self._gather( From 93ffc7ddf53b05dc3bd25452794747a67ad6e5ce Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 3 Mar 2022 00:10:42 -0800 Subject: [PATCH 34/38] Move nested function outside --- python/cudf/cudf/tests/conftest.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 14b0ad96e79..a22531697e9 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -34,12 +34,15 @@ def sample_n_frac(request): return n, frac -def _random_state_tuple(seed_or_state_ctor): - def shape_checker(expected, got): - assert expected.shape == got.shape +def shape_checker(expected, got): + assert expected.shape == got.shape + + +def exact_checker(expected, got): + assert_eq(expected, got) - def exact_checker(expected, got): - assert_eq(expected, got) + +def _random_state_tuple(seed_or_state_ctor): if seed_or_state_ctor is None: return None, None, shape_checker From d0c7c3972f8122d541525c49bb11e555debc63bb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 3 Mar 2022 20:16:30 -0800 Subject: [PATCH 35/38] Inline random state construction --- python/cudf/cudf/tests/conftest.py | 31 +++++++++++++----------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index a22531697e9..ef195d548ef 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -42,22 +42,12 @@ def exact_checker(expected, got): assert_eq(expected, got) -def _random_state_tuple(seed_or_state_ctor): - - if seed_or_state_ctor is None: - return None, None, shape_checker - elif isinstance(seed_or_state_ctor, int): - return seed_or_state_ctor, seed_or_state_ctor, shape_checker - elif seed_or_state_ctor == np.random.RandomState: - return seed_or_state_ctor(42), seed_or_state_ctor(42), exact_checker - elif seed_or_state_ctor == cp.random.RandomState: - return np.random.RandomState(42), seed_or_state_ctor(42), shape_checker - - assert False, "No random state is specialized beyond the above types." - - @pytest.fixture( - params=[None, 42, np.random.RandomState], + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + ], ids=["None", "IntSeed", "NumpyRandomState"], ) def random_state_tuple_axis_1(request): @@ -75,11 +65,16 @@ def random_state_tuple_axis_1(request): Each column above stands for one valid parameter combination and check. """ - return _random_state_tuple(request.param) + return request.param @pytest.fixture( - params=[None, 42, np.random.RandomState, cp.random.RandomState], + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + (np.random.RandomState(42), cp.random.RandomState(42), shape_checker), + ], ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], ) def random_state_tuple_axis_0(request): @@ -97,7 +92,7 @@ def random_state_tuple_axis_0(request): Each column above stands for one valid parameter combination and check. """ - return _random_state_tuple(request.param) + return request.param @pytest.fixture(params=[None, "builtin_list", "ndarray"]) From 66acd59cb91c1e87c3e093a9b2bc4e3d2ea7cc77 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 3 Mar 2022 20:59:56 -0800 Subject: [PATCH 36/38] Remove nested fixture request --- python/cudf/cudf/tests/conftest.py | 11 +++++------ python/cudf/cudf/tests/test_dataframe.py | 9 +++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index ef195d548ef..4a42d811c80 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -96,21 +96,20 @@ def random_state_tuple_axis_0(request): @pytest.fixture(params=[None, "builtin_list", "ndarray"]) -def make_weights_axis_0(request, random_state_tuple_axis_0): +def make_weights_axis_0(request): """Specific to `test_sample*_axis_0` tests. Only testing weights array that matches type with random state. """ - _, gd_random_state, _ = random_state_tuple_axis_0 if request.param is None: - return lambda _: (None, None) + return lambda *_: (None, None) elif request.param == "builtin-list": - return lambda size: ([1] * size, [1] * size) + return lambda size, _: ([1] * size, [1] * size) else: - def wrapped(size): + def wrapped(size, numpy_weights_for_cudf): # Uniform distribution, non-normalized - if isinstance(gd_random_state, np.random.RandomState): + if numpy_weights_for_cudf: return np.ones(size), np.ones(size) else: return np.ones(size), cp.ones(size) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 03f67587041..2beae04a0db 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7200,7 +7200,9 @@ def test_sample_axis_0( ) df = cudf.DataFrame.from_pandas(pdf) - pd_weights, gd_weights = make_weights_axis_0(len(pdf)) + pd_weights, gd_weights = make_weights_axis_0( + len(pdf), isinstance(gd_random_state, np.random.RandomState) + ) if ( not replace and not isinstance(gd_random_state, np.random.RandomState) @@ -7289,7 +7291,10 @@ def test_oversample_without_replace(n, frac, axis): @pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) def test_sample_unsupported_arguments(random_state): df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) - with pytest.raises(NotImplementedError, match="Unsupported argument"): + with pytest.raises( + NotImplementedError, + match="Random sampling with cupy does not support these inputs.", + ): df.sample( n=2, replace=False, random_state=random_state, weights=[1] * 5 ) From a7588b37f860beb7962388ab286cab09f06fe641 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 3 Mar 2022 21:02:14 -0800 Subject: [PATCH 37/38] Use common variable --- python/cudf/cudf/tests/test_dataframe.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2beae04a0db..4a18bb64afc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7240,10 +7240,9 @@ def test_sample_axis_0( def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - expected = df.sample( - 1024, replace=replace, random_state=random_state_lib(10) - ) - out = df.sample(1024, replace=replace, random_state=random_state_lib(10)) + n = 1024 + expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) + out = df.sample(n, replace=replace, random_state=random_state_lib(10)) assert_eq(expected, out) From f4e268614c5fc7833aa75e08c9d84971a46e92cd Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 3 Mar 2022 21:08:26 -0800 Subject: [PATCH 38/38] Parametrize series test into axis_0 --- python/cudf/cudf/tests/test_dataframe.py | 24 +++++++++------ python/cudf/cudf/tests/test_series.py | 38 ------------------------ 2 files changed, 15 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4a18bb64afc..5bde75c2e21 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7184,21 +7184,27 @@ def test_sample_axis_1( checker(expected, got) +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ), + pd.Series([1, 2, 3, 4, 5]), + ], +) @pytest.mark.parametrize("replace", [True, False]) def test_sample_axis_0( - sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 + pdf, sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 ): n, frac = sample_n_frac pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - df = cudf.DataFrame.from_pandas(pdf) + df = cudf.from_pandas(pdf) pd_weights, gd_weights = make_weights_axis_0( len(pdf), isinstance(gd_random_state, np.random.RandomState) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b551d5b89cd..55fcd15f0d5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1590,44 +1590,6 @@ def test_fill_new_category(): gs[0:1] = "d" -@pytest.mark.parametrize("replace", [True, False]) -def test_sample( - sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 -): - n, frac = sample_n_frac - pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 - psr = pd.Series([1, 2, 3, 4, 5]) - sr = cudf.Series.from_pandas(psr) - - pd_weights, gd_weights = make_weights_axis_0(len(psr)) - if ( - not replace - and not isinstance(gd_random_state, np.random.RandomState) - and gd_weights is not None - ): - pytest.skip( - "`cupy.random.RandomState` doesn't support weighted sampling " - "without replacement." - ) - - expected = psr.sample( - n=n, - frac=frac, - replace=replace, - weights=pd_weights, - random_state=pd_random_state, - ) - - got = sr.sample( - n=n, - frac=frac, - replace=replace, - weights=gd_weights, - random_state=gd_random_state, - ) - checker(expected, got) - - @pytest.mark.parametrize( "data", [