Skip to content

Commit

Permalink
[REVIEW] Retain index in stratified splitting for dataframes (#2805)
Browse files Browse the repository at this point in the history
* Handle CUDF case separately

* Update changelog

* STYLE fix

* CHANGE test to 10 rows

Co-authored-by: Nanthini Balasubramanian <[email protected]>
  • Loading branch information
Nanthini10 and Nanthini10 authored Sep 25, 2020
1 parent 808c1f6 commit bd65c15
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
- PR #2744: Supporting larger number of classes in KNeighborsClassifier
- PR #2769: Remove outdated doxygen options for 1.8.20
- PR #2787: Skip lightgbm test for version 3 and above temporarily
- PR #2805: Retain index in stratified splitting for dataframes
- PR #2781: Use Python print to correctly redirect spdlogs when sys.stdout is changed
- PR #2787: Skip lightgbm test for version 3 and above temporarily
- PR #2813: Fix memory access in generation of non-row-major random blobs
Expand Down
77 changes: 56 additions & 21 deletions python/cuml/preprocessing/model_selection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -45,26 +46,31 @@ def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
"""
x_cudf = False
y_cudf = False

if isinstance(X, cudf.DataFrame):
x_cudf = True
X = X.values
elif hasattr(X, "__cuda_array_interface__"):
X = cp.asarray(X)
x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
cp.dtype(X.dtype))

if isinstance(y, cudf.Series):
y_cudf = True
y = y.values
elif hasattr(y, "__cuda_array_interface__"):
y = cp.asarray(y)
y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
cp.dtype(y.dtype))
elif isinstance(y, cudf.DataFrame):
y_cudf = True
# ensuring it has just one column
if y.shape[1] == 1:
y = y.values
else:
if y.shape[1] != 1:
raise ValueError('Expected one label, but found y'
'with shape = %d' % (y.shape))

classes, y_indices = cp.unique(y, return_inverse=True)
classes, y_indices = cp.unique(y.values if y_cudf
else y,
return_inverse=True)

n_classes = classes.shape[0]
class_counts = cp.bincount(y_indices)
if n_train < n_classes:
Expand All @@ -91,21 +97,50 @@ def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
for i in range(n_classes):
permutation = random_state.permutation(class_counts[i].item())
perm_indices_class_i = class_indices[i].take(permutation)
X_train_i = X[perm_indices_class_i[:n_i[i]]]
X_test_i = X[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]
y_train_i = y[perm_indices_class_i[:n_i[i]]]
y_test_i = y[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]

if X_train is None:
X_train = X_train_i
y_train = y_train_i
X_test = X_test_i
y_test = y_test_i
else:
X_train = cp.concatenate([X_train, X_train_i], axis=0)
X_test = cp.concatenate([X_test, X_test_i], axis=0)
y_train = cp.concatenate([y_train, y_train_i], axis=0)
y_test = cp.concatenate([y_test, y_test_i], axis=0)

if hasattr(X, "__cuda_array_interface__") or \
isinstance(X, cupyx.scipy.sparse.csr_matrix):

X_train_i = cp.array(X[perm_indices_class_i[:n_i[i]]],
order=x_order)
X_test_i = cp.array(X[perm_indices_class_i[n_i[i]:n_i[i] +
t_i[i]]],
order=x_order)

y_train_i = cp.array(y[perm_indices_class_i[:n_i[i]]],
order=y_order)
y_test_i = cp.array(y[perm_indices_class_i[n_i[i]:n_i[i] +
t_i[i]]],
order=y_order)

if X_train is None:
X_train = cp.array(X_train_i, order=x_order)
y_train = cp.array(y_train_i, order=y_order)
X_test = cp.array(X_test_i, order=x_order)
y_test = cp.array(y_test_i, order=y_order)
else:
X_train = cp.concatenate([X_train, X_train_i], axis=0)
X_test = cp.concatenate([X_test, X_test_i], axis=0)
y_train = cp.concatenate([y_train, y_train_i], axis=0)
y_test = cp.concatenate([y_test, y_test_i], axis=0)

elif x_cudf:
X_train_i = X.iloc[perm_indices_class_i[:n_i[i]]]
X_test_i = X.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]

y_train_i = y.iloc[perm_indices_class_i[:n_i[i]]]
y_test_i = y.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]

if X_train is None:
X_train = X_train_i
y_train = y_train_i
X_test = X_test_i
y_test = y_test_i
else:
X_train = cudf.concat([X_train, X_train_i], ignore_index=False)
X_test = cudf.concat([X_test, X_test_i], ignore_index=False)
y_train = cudf.concat([y_train, y_train_i], ignore_index=False)
y_test = cudf.concat([y_test, y_test_i], ignore_index=False)

if x_numba:
X_train = cuda.as_cuda_array(X_train)
Expand Down
23 changes: 22 additions & 1 deletion python/cuml/test/test_train_test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,10 +343,31 @@ def test_stratified_random_seed(seed_type):
assert y_test.equals(y_test2)

# Ensure that data is shuffled
assert not X_train.head().equals(X.head())
assert not (X.head().index.values == X_train.head().index.values).all()

def monotonic_inc(x):
    """Return whether all consecutive differences of ``x.values`` equal 1.

    Differences are taken along axis 0 with ``cp.diff``; the result is
    whatever ``cp.all`` returns for the element-wise comparison.
    """
    return cp.all(cp.diff(x.values, axis=0) == 1)

assert not monotonic_inc(X_train)


@pytest.mark.parametrize('test_size', [0.2, 0.4, None])
@pytest.mark.parametrize('train_size', [0.6, 0.8, None])
def test_stratify_retain_index(test_size, train_size):
    """Check that a stratified split of a cudf DataFrame keeps row indices.

    Column "x" is filled with 0..9; each split is then checked for
    ``split["x"] == split.index``, which only holds if rows carry their
    original index labels (assuming X starts with a default 0..9 index).
    """
    n_rows = 10
    half = n_rows // 2

    # Two equally sized classes so stratification has something to balance.
    X = cudf.DataFrame({"x": range(n_rows)})
    y = cudf.Series([0] * half + [1] * half)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        stratify=True,
    )

    # Train split is checked first, then test split.
    for split in (X_train, X_test):
        assert (split["x"] == split.index).all()

    # Only one size is verified: train_size takes precedence when given.
    total_rows = X.shape[0]
    if train_size is not None:
        assert X_train.shape[0] == int(total_rows * train_size)
    elif test_size is not None:
        assert X_test.shape[0] == int(total_rows * test_size)

0 comments on commit bd65c15

Please sign in to comment.