rapidsai · rapids-bot · Nov 11, 2022 · Oct 26, 2022 · Oct 27, 2022 · Oct 27, 2022
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,8 @@
 #
 from hypothesis import assume
 from hypothesis.extra.numpy import arrays, floating_dtypes
-from hypothesis.strategies import booleans, composite, floats, integers
+from hypothesis.strategies import (booleans, composite, floats, integers,
+                                   sampled_from)
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
 
@@ -26,19 +27,62 @@ def datasets(
     n_samples=integers(min_value=0, max_value=200),
     n_features=integers(min_value=0, max_value=200),
 ):
+    """
+    Generic datasets that can serve as an input to an estimator.
+
+    Parameters
+    ----------
+    dtypes: SearchStrategy[np.dtype]
+        Returned arrays will have a dtype drawn from these types.
+    n_samples: SearchStrategy[int]
+        Returned arrays will have number of rows drawn from these values.
+    n_features: SearchStrategy[int]
+        Returned arrays will have number of columns drawn from these values.
+
+    Returns
+    -------
+    X: SearchStrategy[array] (n_samples, n_features)
+        The search strategy for input samples.
+    y: SearchStrategy[array] (n_samples, 1) or (n_samples, n_features)
+        The search strategy for output samples.
+
+    """
     xs = draw(n_samples)
     ys = draw(n_features)
     X = arrays(dtype=dtypes, shape=(xs, ys))
-    y = arrays(dtype=dtypes, shape=(xs, 1))
+    y = arrays(dtype=dtypes, shape=(xs, draw(sampled_from((1, ys)))))
     return draw(X), draw(y)
 
 
 @composite
 def split_datasets(
     draw,
-    datasets=datasets(),
+    datasets=datasets(n_samples=integers(min_value=10, max_value=200)),
     train_sizes=floats(min_value=0.1, max_value=1.0, exclude_max=True),
 ):
+    """
+    Split a generic search strategy for datasets into test and train subsets.
+
+    Note: This function uses the sklearn.model_selection.train_test_split
+    function.
+
+    See also:
+    datasets(): A search strategy for datasets that can serve as input to this
+    strategy.
+
+    Parameters
+    ----------
+    datasets: SearchStrategy[dataset]
+        A search strategy for datasets.
+    train_sizes: SearchStrategy[float]
+        A search strategy for the train size. Must be provided as float and is
+        limited by valid inputs to sklearn's train_test_split() function.
+
+    Returns
+    -------
+    splitting: list, length=2 * len(arrays)
+        List with a drawn train-test split of the drawn dataset.
+    """
     X, y = draw(datasets)
     ts = draw(train_sizes)
     assume(int(len(X) * ts) > 0)  # train_test_split limitation
@@ -54,6 +98,31 @@ def regression_datasets(
     n_informatives=integers(min_value=0, max_value=200),
     is_normal=booleans(),
 ):
+    """
+    Regression datasets that can serve as an input to an estimator.
+
+    See also:
+    datasets(): Generate generic datasets.
+    split_datasets(): Split dataset into test-train subsets.
+
+    Parameters
+    ----------
+    dtypes: SearchStrategy[np.dtype]
+        Returned arrays will have a dtype drawn from these types.
+    n_samples: SearchStrategy[int]
+        Returned arrays will have number of rows drawn from these values.
+    n_features: SearchStrategy[int]
+        Returned arrays will have number of columns drawn from these values.
+    n_informatives: SearchStrategy[int]
+        Determines the number of informative features in a normal dataset.
+    is_normal: SearchStrategy[bool]
+        Whether the returned dataset is considered normal or more random.
+
+    Returns
+    -------
+    (X, y):  tuple(SearchStrategy[array], SearchStrategy[array])
+        A tuple of search strategies for the requested arrays.
+    """
     if draw(is_normal):
         dtype_ = draw(dtypes)
         X, y = make_regression(

@@ -20,6 +20,7 @@
 
 import numpy as np
 import cupy as cp
+import hypothesis
 
 from math import ceil
 from sklearn.datasets import fetch_20newsgroups
@@ -34,6 +35,20 @@
 pytest_plugins = ("cuml.testing.plugins.quick_run_plugin")
 
 
+# Configure hypothesis profiles
+
+hypothesis.settings.register_profile(
+    name="quality",
+    parent=hypothesis.settings.get_profile("default"),
+)
+
+hypothesis.settings.register_profile(
+    name="stress",
+    parent=hypothesis.settings.get_profile("quality"),
+    max_examples=1000
+)
+
+
 def pytest_addoption(parser):
     # Any custom option, that should be available at any time (not just a
     # plugin), goes here.
@@ -101,6 +116,15 @@ def pytest_configure(config):
     pytest.max_gpu_memory = get_gpu_memory()
     pytest.adapt_stress_test = 'CUML_ADAPT_STRESS_TESTS' in os.environ
 
+    # Load special hypothesis profiles for either quality or stress tests.
+    # Note that the profile can be manually overridden with the
+    # --hypothesis-profile command line option, in which case the settings
+    # specified here will be ignored.
+    if config.getoption("--run_stress"):
+        hypothesis.settings.load_profile("stress")
+    elif config.getoption("--run_quality"):
+        hypothesis.settings.load_profile("quality")
+
 
 @pytest.fixture(scope="module")
 def nlp_20news():

@@ -217,25 +217,23 @@ def test_linear_regression_single_column():
 )
 @example(small_regression_dataset(np.float32))
 @example(small_regression_dataset(np.float64))
-@settings(
-    deadline=5000, max_examples=20
-)  # TODO: re-evaluate max_examples after benchmarking
+@settings(deadline=5000)
 def test_linear_regression_model_default(dataset):
 
     X_train, X_test, y_train, y_test = dataset
     n_rows, n_cols = X_train.shape
 
-    ## Required assumptions:
-    # sklinearRegression:
-    assume(n_cols >= 1)
+    # Required assumptions:
+    #  sklinearRegression:
     assume((X_train > 0).any())
     assume((y_train > 0).any())
-    assume(np.isfinite(X_train).all())
     assume(np.isfinite(y_train).all())
-    # cuml.LinearRegression:
+    #  cuml.LinearRegression:
     assume(n_rows >= 2)
-    # both:
+    #  both:
     assume(n_cols >= 1)
+    #    w/o the next assumption sklearn complains and cuml hangs(!):
+    assume(np.isfinite(X_train).all())
 
     # Initialization of cuML's linear regression model
     cuols = cuLinearRegression()

@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -49,6 +49,8 @@ Examples subject to numerical imprecision, or that can't be reproduced consisten
 ## Testing and Unit Testing
 We use [pytest](https://docs.pytest.org/en/latest/) for writing and running tests. To see existing examples, refer to any of the `test_*.py` files in the folder `cuml/tests`.
 
+Some tests are run against inputs generated with [hypothesis](https://hypothesis.works/). See the `cuml/testing/strategies.py` module for custom strategies that can be used to test cuml estimators with diverse inputs. For example, use the `regression_datasets()` strategy to test random regression problems.
+
 ## Device and Host memory allocations
 TODO: talk about enabling RMM here when it is ready