Ensure make_classification respects output type (#3415)

Switch to the `api_return_generic` decorator in order to get the correct output type from `make_classification`. Provide tests of global_output_type compliance for all dataset generators.

Authors:
- William Hicks (@wphicks)

Approvers:
- John Zedlewski (@JohnZed)
- Corey J. Nolet (@cjnolet)

URL: #3415
rapidsai · Feb 2, 2021 · a3c62b1 · a3c62b1
1 parent fa2371a
commit a3c62b1
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 32 deletions.
diff --git a/python/cuml/datasets/classification.py b/python/cuml/datasets/classification.py
@@ -41,7 +41,7 @@ def _generate_hypercube(samples, dimensions, rng):
     return out
 
 
-@cuml.internals.api_return_any()
+@cuml.internals.api_return_generic()
 def make_classification(n_samples=100, n_features=20, n_informative=2,
                         n_redundant=2, n_repeated=0, n_classes=2,
                         n_clusters_per_class=2, weights=None, flip_y=0.01,
@@ -205,6 +205,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
            selection benchmark", 2003.
 
     """
+    cuml.internals.set_api_output_type("cupy")
+
     generator = _create_rs_generator(random_state)
     np_seed = int(generator.randint(n_samples, size=1))
     np.random.seed(np_seed)

diff --git a/python/cuml/test/test_dataset_generator_types.py b/python/cuml/test/test_dataset_generator_types.py
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import cudf
+import cupy as cp
+import numba
+import numpy as np
+import pytest
+
+import cuml
+from cuml.datasets import (
+    make_arima,
+    make_blobs,
+    make_classification,
+    make_regression
+)
+
+
+TEST_OUTPUT_TYPES = (
+    (None, (cp.ndarray, cp.ndarray)),  # Default is cupy if None is used
+    ('numpy', (np.ndarray, np.ndarray)),
+    ('cupy', (cp.ndarray, cp.ndarray)),
+    ('numba', (numba.cuda.devicearray.DeviceNDArrayBase,
+               numba.cuda.devicearray.DeviceNDArrayBase)),
+    ('cudf', (cudf.DataFrame, cudf.Series))
+)
+
+GENERATORS = (
+    make_blobs, make_classification, make_regression
+)
+
+
+@pytest.mark.parametrize('generator', GENERATORS)
+@pytest.mark.parametrize(
+    'output_str,output_types', TEST_OUTPUT_TYPES
+)
+def test_xy_output_type(generator, output_str, output_types):
+
+    # Set the output type and ensure data of that type is generated
+    with cuml.using_output_type(output_str):
+        data = generator(n_samples=10, random_state=0)
+
+    for data, type_ in zip(data, output_types):
+        assert isinstance(data, type_)
+
+
+@pytest.mark.parametrize(
+    'output_str,output_types', TEST_OUTPUT_TYPES
+)
+def test_time_series_label_output_type(output_str, output_types):
+
+    # Set the output type and ensure data of that type is generated
+    with cuml.using_output_type(output_str):
+        data = make_arima(n_obs=10, random_state=0)[0]
+
+    assert isinstance(data, output_types[1])
diff --git a/python/cuml/test/test_make_blobs.py b/python/cuml/test/test_make_blobs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +16,6 @@
 import cuml
 import pytest
 import cupy as cp
-import cudf
-import numpy as np
-import numba.cuda
 
 # Testing parameters for scalar parameter tests
 
@@ -96,30 +93,3 @@ def test_make_blobs_scalar_parameters(dtype, n_samples, n_features, centers,
     elif centers <= n_samples:
         assert cp.unique(labels).shape == (centers,), \
             "unexpected number of clusters"
-
-
-test_output_types = {
-    None: cp.ndarray,  # Default is cupy if None is used
-    'numpy': np.ndarray,
-    'cupy': cp.ndarray,
-    'numba': numba.cuda.devicearray.DeviceNDArrayBase,
-    'cudf': (cudf.DataFrame, cudf.Series)
-}
-
-
-@pytest.mark.parametrize("input_type", test_output_types.keys())
-def test_output_type(input_type: str):
-
-    # Set the output type and ensure its respected by the function
-    with cuml.using_output_type(input_type):
-        X, y = cuml.make_blobs(n_samples=10,
-                               centers=3,
-                               n_features=2,
-                               random_state=0)
-
-        if (isinstance(test_output_types[input_type], tuple)):
-            assert (isinstance(X, test_output_types[input_type][0]))
-            assert (isinstance(y, test_output_types[input_type][1]))
-        else:
-            assert (isinstance(X, test_output_types[input_type]))
-            assert (isinstance(y, test_output_types[input_type]))