Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrites sample API #10262

Merged
merged 43 commits into rapidsai:branch-22.04 from isVoid:improvement/rewrite_sample
Mar 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
b831cc9
Rewrites `sample`
isVoid Feb 10, 2022
ec64a98
Restore support for index but add deprecation warning
isVoid Feb 10, 2022
80e6cf6
Better logic to compute `n`.
isVoid Feb 15, 2022
82597e6
Move size checks
isVoid Feb 15, 2022
aa166b7
Consolidating error checking codes to reduce checking the same variab…
isVoid Feb 15, 2022
0466e84
Change error message.
isVoid Feb 15, 2022
8965538
Inlining `weights`, `random_state` handling; rewriting `weights` docs…
isVoid Feb 15, 2022
3f33dcb
Rewrites `make_random_state` into a fixture; Optimizes code locations.
isVoid Feb 15, 2022
79d3d56
Use fixture in reproducibility test
isVoid Feb 15, 2022
ecd8061
Reverting support for cupy arrays for weights; add tests for weights.
isVoid Feb 15, 2022
ce2e0fa
Apply suggestions from code review
isVoid Feb 15, 2022
b0b6ce7
Apply fixes from reviews
isVoid Feb 15, 2022
62ee2b4
Conforming error messages.
isVoid Feb 16, 2022
6c34a71
Removing redundant tests
isVoid Feb 16, 2022
e0d78ee
Rewrites weights fixture; skip comparing error messages.
isVoid Feb 16, 2022
8c0dff1
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 16, 2022
5300d02
Update copyright headers
isVoid Feb 16, 2022
f3b05ec
Committing changes discussed offline
isVoid Feb 24, 2022
b08a9b5
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
isVoid Feb 24, 2022
7c07fbd
Skip comparing index.sample message
isVoid Feb 25, 2022
0a8888a
Merge branch 'branch-22.04' into improvement/rewrite_sample
vyasr Feb 25, 2022
4e1f4f3
Fix index style
isVoid Feb 25, 2022
118f058
Merge branch 'improvement/rewrite_sample' of github.com:isVoid/cudf i…
isVoid Feb 25, 2022
ddc80aa
Revert copyright change in b08a9b53400ad9801776ddadb986fbed9171ba5b
isVoid Feb 25, 2022
c57f470
Update python/cudf/cudf/core/indexed_frame.py
isVoid Feb 25, 2022
f158610
Remove error mimic
isVoid Feb 25, 2022
4f0d2ea
Pre-commits
isVoid Feb 25, 2022
2a9c013
update axis=1 cupy random state error message
isVoid Feb 25, 2022
84d2da3
update reproducibility test
isVoid Feb 25, 2022
3f797cf
Revert test_struct copyright change.
isVoid Feb 25, 2022
413d8e9
Merge branch 'improvement/rewrite_sample' of github.com:isVoid/cudf i…
isVoid Feb 25, 2022
967a77e
Test built-in iterable with argument
isVoid Feb 25, 2022
ebf6f70
doc fix
isVoid Feb 25, 2022
20a5f9c
Simplifying `axis==1` case
isVoid Feb 26, 2022
17319b1
Simplify `axis=0` case, further document unsupported argument combina…
isVoid Feb 26, 2022
ac77a23
Simplify series test.
isVoid Feb 26, 2022
12015f7
Update comments
isVoid Mar 3, 2022
5511585
Update error message
isVoid Mar 3, 2022
93ffc7d
Move nested function outside
isVoid Mar 3, 2022
d0c7c39
Inline random state construction
isVoid Mar 4, 2022
66acd59
Remove nested fixture request
isVoid Mar 4, 2022
a7588b3
Use common variable
isVoid Mar 4, 2022
f4e2686
Parametrize series test into axis_0
isVoid Mar 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -656,32 +656,6 @@ def get_element(Column input_column, size_type index):
)


def sample(input, size_type n,
           bool replace, int64_t seed, bool keep_index=True):
    """Call the C++ ``cpp_copying.sample`` routine on ``input``.

    Parameters
    ----------
    input
        Object exposing ``_column_names`` and ``_index_names``; it is
        converted to a ``table_view`` via ``table_view_from_table``.
    n : size_type
        Forwarded to ``cpp_copying.sample`` as the sample size.
    replace : bool
        Selects ``sample_with_replacement.TRUE`` or ``.FALSE``.
    seed : int64_t
        Forwarded unchanged to ``cpp_copying.sample``.
    keep_index : bool, default True
        When False, ``not keep_index`` is passed to
        ``table_view_from_table`` and no index names are attached to the
        result (presumably the index columns are left out of the view --
        confirm against ``table_view_from_table``).

    Returns
    -------
    Whatever ``data_from_unique_ptr`` builds from the sampled table.
    """
    cdef table_view tbl_view = table_view_from_table(input, not keep_index)
    cdef cpp_copying.sample_with_replacement replacement

    # Translate the Python-level flag into the C++ enum value.
    if replace:
        replacement = cpp_copying.sample_with_replacement.TRUE
    else:
        replacement = cpp_copying.sample_with_replacement.FALSE

    cdef unique_ptr[table] c_output
    # The C++ call runs with the GIL released.
    with nogil:
        c_output = move(
            cpp_copying.sample(tbl_view, n, replacement, seed)
        )

    return data_from_unique_ptr(
        move(c_output),
        column_names=input._column_names,
        # Index names are only attached when the index was kept.
        index_names=(
            None if keep_index is False
            else input._index_names
        )
    )


def segmented_gather(Column source_column, Column gather_map):
cdef shared_ptr[lists_column_view] source_LCV = (
make_shared[lists_column_view](source_column.view())
Expand Down
9 changes: 1 addition & 8 deletions python/cudf/cudf/_lib/cpp/copying.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libc.stdint cimport int32_t, int64_t, uint8_t
from libcpp cimport bool
Expand Down Expand Up @@ -175,10 +175,3 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
ctypedef enum sample_with_replacement:
FALSE 'cudf::sample_with_replacement::FALSE',
TRUE 'cudf::sample_with_replacement::TRUE',

cdef unique_ptr[table] sample (
isVoid marked this conversation as resolved.
Show resolved Hide resolved
table_view input,
size_type n,
sample_with_replacement replacement,
int64_t seed
) except +
22 changes: 22 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import pickle
import warnings
from functools import cached_property
from typing import Any, Set

Expand Down Expand Up @@ -1528,6 +1529,27 @@ def _split_columns_by_levels(self, levels):
[],
)

def sample(
    self,
    n=None,
    frac=None,
    replace=False,
    weights=None,
    random_state=None,
    axis=None,
    ignore_index=False,
):
    """Sample from this index (deprecated).

    Emits a ``FutureWarning``, converts the index to a frame with
    ``to_frame()``, forwards every argument positionally to that frame's
    ``sample`` and rebuilds an index from the sampled frame's ``_data``.
    """
    warnings.warn(
        "Index.sample is deprecated and will be removed.", FutureWarning,
    )
    # Delegate the actual sampling to the frame representation.
    as_frame = self.to_frame()
    sampled_frame = as_frame.sample(
        n, frac, replace, weights, random_state, axis, ignore_index
    )
    return cudf.core.index._index_from_data(sampled_frame._data)


def _get_result_name(left_name, right_name):
if left_name == right_name:
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import cudf
import cudf.core.common
from cudf import _lib as libcudf
from cudf._typing import ColumnLike
from cudf.api.types import (
_is_scalar_or_zero_d_array,
is_bool_dtype,
Expand Down Expand Up @@ -6327,6 +6328,32 @@ def nunique(self, axis=0, dropna=True):

return cudf.Series(super().nunique(method="sort", dropna=dropna))

def _sample_axis_1(
    self,
    n: int,
    weights: Optional[ColumnLike],
    replace: bool,
    random_state: np.random.RandomState,
    ignore_index: bool,
):
    """Sample ``n`` columns (axis 1) without replacement.

    Parameters
    ----------
    n : int
        Number of column labels to draw.
    weights : column-like or None
        Passed as ``p`` to ``random_state.choice``.
    replace : bool
        Must be falsy; sampling columns with replacement is unsupported.
    random_state : np.random.RandomState
        Generator whose ``choice`` method draws the column labels.
    ignore_index : bool
        When True, the result is returned after ``reset_index(drop=True)``.

    Returns
    -------
    The object produced by ``self._get_columns_by_label`` for the sampled
    labels, optionally with its index reset.

    Raises
    ------
    NotImplementedError
        If ``replace`` is truthy.
    """
    if replace:
        # Since cuDF does not support multiple columns with same name,
        # sample with replace=True at axis 1 is unsupported.
        raise NotImplementedError(
            "Sample is not supported for axis 1/`columns` when "
            "`replace=True`."
        )

    sampled_column_labels = random_state.choice(
        self._column_names, size=n, replace=False, p=weights
    )

    result = self._get_columns_by_label(sampled_column_labels)
    if ignore_index:
        # reset_index is called without inplace=True, so its return value
        # must be kept; discarding it made ignore_index a no-op.
        result = result.reset_index(drop=True)

    return result


def from_dataframe(df, allow_copy=False):
    """Forward ``df`` and ``allow_copy`` to ``df_protocol.from_dataframe``.

    Returns whatever ``df_protocol.from_dataframe`` returns.
    """
    return df_protocol.from_dataframe(df, allow_copy=allow_copy)
Expand Down
195 changes: 1 addition & 194 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from cudf.core.window import Rolling
from cudf.utils import ioutils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import find_common_type, is_column_like
from cudf.utils.dtypes import find_common_type

T = TypeVar("T", bound="Frame")

Expand Down Expand Up @@ -1656,199 +1656,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
zip(self._column_names, data_columns), self._index
)

@annotate("FRAME_SAMPLE", color="orange", domain="cudf_python")
def sample(
    self,
    n=None,
    frac=None,
    replace=False,
    weights=None,
    random_state=None,
    axis=None,
    keep_index=True,
):
    """Return a random sample of items from an axis of object.

    You can use random_state for reproducibility.

    Parameters
    ----------
    n : int, optional
        Number of items from axis to return. Cannot be used with frac.
        Default = 1 if frac = None.
    frac : float, optional
        Fraction of axis items to return. Cannot be used with n.
    replace : bool, default False
        Allow or disallow sampling of the same row more than once.
        replace == True is not yet supported for axis = 1/"columns"
    weights : ndarray-like, optional
        Only supported for axis=1/"columns". Must have one entry per
        column; weights that do not sum to 1 are normalized.
    random_state : int, numpy RandomState or None, default None
        Seed for the random number generator (if int), or None.
        If None, a random seed will be chosen.
        If RandomState, for axis=0 a seed is extracted from its current
        state; for axis=1 it is used directly to draw the columns.
    axis : {0 or 'index', 1 or 'columns', None}, default None
        Axis to sample. Accepts axis number or name.
        Default is stat axis for given data type
        (0 for Series and DataFrames). Series and Index doesn't
        support axis=1.
    keep_index : bool, default True
        For axis=0 it is forwarded to ``libcudf.copying.sample``; for
        axis=1, when False the result's ``index`` is set to None.

    Returns
    -------
    Series or DataFrame or Index
        A new object of same type as caller containing n items
        randomly sampled from the caller object.

    Raises
    ------
    ValueError
        If both ``n`` and ``frac`` are given, if ``frac > 1`` without
        ``replace=True``, if the sample is larger than the population
        without replacement, if the sampled axis is empty, if ``axis``
        is not valid for this object, or if ``weights`` is invalid.
    NotImplementedError
        If ``weights`` is given for axis=0, or ``replace=True`` for
        axis=1.

    Examples
    --------
    >>> import cudf as cudf
    >>> df = cudf.DataFrame({"a":[1, 2, 3, 4, 5]})
    >>> df.sample(3)
       a
    1  2
    3  4
    0  1

    >>> sr = cudf.Series([1, 2, 3, 4, 5])
    >>> sr.sample(10, replace=True)
    1    4
    3    1
    2    4
    0    5
    0    1
    4    5
    4    1
    0    2
    0    3
    3    2
    dtype: int64

    >>> df = cudf.DataFrame(
    ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]})
    >>> df.sample(2, axis=1)
       a  c
    0  1  3
    1  2  4
    """

    if frac is not None and frac > 1 and not replace:
        raise ValueError(
            "Replace has to be set to `True` "
            "when upsampling the population `frac` > 1."
        )
    elif frac is not None and n is not None:
        raise ValueError(
            "Please enter a value for `frac` OR `n`, not both"
        )

    sample_rows = axis is None or axis == 0 or axis == "index"

    # Validate the axis before touching self.shape[1]; otherwise a 1-D
    # object sampled with `frac` on axis 1 fails with an IndexError
    # instead of the intended ValueError.
    if not sample_rows and len(self.shape) != 2:
        raise ValueError(
            f"No axis named {axis} for "
            f"object type {self.__class__}"
        )

    if frac is None and n is None:
        n = 1
    elif frac is not None:
        population = self.shape[0] if sample_rows else self.shape[1]
        n = int(round(population * frac))

    if sample_rows:
        if n > 0 and self.shape[0] == 0:
            raise ValueError(
                "Cannot take a sample larger than 0 when axis is empty"
            )

        if not replace and n > self.shape[0]:
            raise ValueError(
                "Cannot take a larger sample than population "
                "when 'replace=False'"
            )

        if weights is not None:
            raise NotImplementedError(
                "weights is not yet supported for axis=0/index"
            )

        # Reduce random_state to the integer seed passed to libcudf.
        if random_state is None:
            seed = np.random.randint(
                np.iinfo(np.int64).max, dtype=np.int64
            )
        elif isinstance(random_state, np.random.mtrand.RandomState):
            _, keys, pos, _, _ = random_state.get_state()
            seed = 0 if pos >= len(keys) else pos
        else:
            seed = np.int64(random_state)

        result = self.__class__._from_data(
            *libcudf.copying.sample(
                self,
                n=n,
                replace=replace,
                seed=seed,
                keep_index=keep_index,
            )
        )
        result._copy_type_metadata(self)

        return result

    if replace:
        raise NotImplementedError(
            "Sample is not supported for "
            f"axis {axis} when 'replace=True'"
        )

    if n > 0 and self.shape[1] == 0:
        raise ValueError(
            "Cannot take a sample larger than 0 when axis is empty"
        )

    columns = np.asarray(self._data.names)
    # `replace` is known to be falsy here, so only the size matters.
    if n > columns.size:
        raise ValueError(
            "Cannot take a larger sample "
            "than population when 'replace=False'"
        )

    if weights is not None:
        if is_column_like(weights):
            weights = np.asarray(weights)
        else:
            raise ValueError(
                "Strings can only be passed to weights "
                "when sampling from rows on a DataFrame"
            )

        if columns.size != len(weights):
            raise ValueError(
                "Weights and axis to be sampled must be of same length"
            )

        total_weight = weights.sum()
        if total_weight != 1:
            # Always normalize in float64. The previous
            # `isinstance(weights.dtype, float)` test was never true, so
            # this cast already happened unconditionally.
            weights = weights.astype("float64") / total_weight

    # Use a local generator: seeding the global NumPy RNG is a side
    # effect on unrelated code and fails for RandomState instances.
    # RandomState(seed) draws the same values as np.random.seed(seed).
    if isinstance(random_state, np.random.mtrand.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    gather_map = rng.choice(columns, size=n, replace=False, p=weights)

    if isinstance(self, cudf.MultiIndex):
        # TODO: Need to update this once MultiIndex is refactored,
        # should be able to treat it similar to other Frame object
        result = cudf.Index(self.to_frame(index=False)[gather_map])
    else:
        result = self[gather_map]
        if not keep_index:
            result.index = None

    return result

@classmethod
@annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python")
def from_arrow(cls, data):
Expand Down
Loading