[REVIEW] Retain index in stratified splitting for dataframes (#2805)

* Handle CUDF case separately
* Update changelog
* STYLE fix
* CHANGE test to 10 rows

Co-authored-by: Nanthini Balasubramanian <[email protected]>
rapidsai · Sep 25, 2020 · bd65c15 · bd65c15
1 parent 808c1f6
commit bd65c15
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,7 @@
 - PR #2744: Supporting larger number of classes in KNeighborsClassifier
 - PR #2769: Remove outdated doxygen options for 1.8.20
 - PR #2787: Skip lightgbm test for version 3 and above temporarily
+- PR #2805: Retain index in stratified splitting for dataframes
 - PR #2781: Use Python print to correctly redirect spdlogs when sys.stdout is changed
 - PR #2787: Skip lightgbm test for version 3 and above temporarily
 - PR #2813: Fix memory access in generation of non-row-major random blobs

diff --git a/python/cuml/preprocessing/model_selection.py b/python/cuml/preprocessing/model_selection.py
@@ -1,3 +1,4 @@
+
 # Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -45,26 +46,31 @@ def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
     """
     x_cudf = False
     y_cudf = False
+
     if isinstance(X, cudf.DataFrame):
         x_cudf = True
-        X = X.values
     elif hasattr(X, "__cuda_array_interface__"):
         X = cp.asarray(X)
+        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
+                                    cp.dtype(X.dtype))
+
     if isinstance(y, cudf.Series):
         y_cudf = True
-        y = y.values
     elif hasattr(y, "__cuda_array_interface__"):
         y = cp.asarray(y)
+        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
+                                    cp.dtype(y.dtype))
     elif isinstance(y, cudf.DataFrame):
         y_cudf = True
         # ensuring it has just one column
-        if y.shape[1] == 1:
-            y = y.values
-        else:
+        if y.shape[1] != 1:
             raise ValueError('Expected one label, but found y'
                              'with shape = %d' % (y.shape))
 
-    classes, y_indices = cp.unique(y, return_inverse=True)
+    classes, y_indices = cp.unique(y.values if y_cudf
+                                   else y,
+                                   return_inverse=True)
+
     n_classes = classes.shape[0]
     class_counts = cp.bincount(y_indices)
     if n_train < n_classes:
@@ -91,21 +97,50 @@ def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
     for i in range(n_classes):
         permutation = random_state.permutation(class_counts[i].item())
         perm_indices_class_i = class_indices[i].take(permutation)
-        X_train_i = X[perm_indices_class_i[:n_i[i]]]
-        X_test_i = X[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]
-        y_train_i = y[perm_indices_class_i[:n_i[i]]]
-        y_test_i = y[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]
-
-        if X_train is None:
-            X_train = X_train_i
-            y_train = y_train_i
-            X_test = X_test_i
-            y_test = y_test_i
-        else:
-            X_train = cp.concatenate([X_train, X_train_i], axis=0)
-            X_test = cp.concatenate([X_test, X_test_i], axis=0)
-            y_train = cp.concatenate([y_train, y_train_i], axis=0)
-            y_test = cp.concatenate([y_test, y_test_i], axis=0)
+
+        if hasattr(X, "__cuda_array_interface__") or \
+           isinstance(X, cupyx.scipy.sparse.csr_matrix):
+
+            X_train_i = cp.array(X[perm_indices_class_i[:n_i[i]]],
+                                 order=x_order)
+            X_test_i = cp.array(X[perm_indices_class_i[n_i[i]:n_i[i] +
+                                                       t_i[i]]],
+                                order=x_order)
+
+            y_train_i = cp.array(y[perm_indices_class_i[:n_i[i]]],
+                                 order=y_order)
+            y_test_i = cp.array(y[perm_indices_class_i[n_i[i]:n_i[i] +
+                                                       t_i[i]]],
+                                order=y_order)
+
+            if X_train is None:
+                X_train = cp.array(X_train_i, order=x_order)
+                y_train = cp.array(y_train_i, order=y_order)
+                X_test = cp.array(X_test_i, order=x_order)
+                y_test = cp.array(y_test_i, order=y_order)
+            else:
+                X_train = cp.concatenate([X_train, X_train_i], axis=0)
+                X_test = cp.concatenate([X_test, X_test_i], axis=0)
+                y_train = cp.concatenate([y_train, y_train_i], axis=0)
+                y_test = cp.concatenate([y_test, y_test_i], axis=0)
+
+        elif x_cudf:
+            X_train_i = X.iloc[perm_indices_class_i[:n_i[i]]]
+            X_test_i = X.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]
+
+            y_train_i = y.iloc[perm_indices_class_i[:n_i[i]]]
+            y_test_i = y.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]
+
+            if X_train is None:
+                X_train = X_train_i
+                y_train = y_train_i
+                X_test = X_test_i
+                y_test = y_test_i
+            else:
+                X_train = cudf.concat([X_train, X_train_i], ignore_index=False)
+                X_test = cudf.concat([X_test, X_test_i], ignore_index=False)
+                y_train = cudf.concat([y_train, y_train_i], ignore_index=False)
+                y_test = cudf.concat([y_test, y_test_i], ignore_index=False)
 
     if x_numba:
         X_train = cuda.as_cuda_array(X_train)

diff --git a/python/cuml/test/test_train_test_split.py b/python/cuml/test/test_train_test_split.py
@@ -343,10 +343,31 @@ def test_stratified_random_seed(seed_type):
     assert y_test.equals(y_test2)
 
     # Ensure that data is shuffled
-    assert not X_train.head().equals(X.head())
+    assert not (X.head().index.values == X_train.head().index.values).all()
 
     def monotonic_inc(x):
         dx = cp.diff(x.values, axis=0)
         return cp.all(dx == 1)
 
     assert not monotonic_inc(X_train)
+
+
+@pytest.mark.parametrize('test_size', [0.2, 0.4, None])
+@pytest.mark.parametrize('train_size', [0.6, 0.8, None])
+def test_stratify_retain_index(test_size, train_size):
+    X = cudf.DataFrame({"x": range(10)})
+    y = cudf.Series(([0] * (10 // 2)) + ([1] * (10 // 2)))
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                        train_size=train_size,
+                                                        test_size=test_size,
+                                                        shuffle=True,
+                                                        stratify=True)
+    assert (X_train["x"] == X_train.index).all()
+    assert (X_test["x"] == X_test.index).all()
+
+    if train_size is not None:
+        assert X_train.shape[0] == (int)(X.shape[0] * train_size)
+
+    elif test_size is not None:
+        assert X_test.shape[0] == (int)(X.shape[0] * test_size)