rapidsai · rapids-bot · Nov 11, 2022 · Oct 26, 2022 · Oct 27, 2022 · Oct 27, 2022
@@ -0,0 +1,72 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from hypothesis import assume
+from hypothesis.extra.numpy import arrays, floating_dtypes
+from hypothesis.strategies import booleans, composite, floats, integers
+from sklearn.datasets import make_regression
+from sklearn.model_selection import train_test_split
+
+
+@composite
+def datasets(
+    draw,
+    dtypes=floating_dtypes(),
+    n_samples=integers(min_value=0, max_value=200),
+    n_features=integers(min_value=0, max_value=200),
+):
+    """
+    Hypothesis strategy that generates an (X, y) pair of arrays.
+
+    Parameters
+    ----------
+    dtypes : strategy
+        Strategy for the element dtype; it is handed to ``arrays``
+        separately for X and for y.
+    n_samples : strategy
+        Strategy for the number of rows shared by X and y.
+    n_features : strategy
+        Strategy for the number of columns of X.
+
+    Returns
+    -------
+    (X, y) : tuple
+        X is drawn with shape (n_samples, n_features) and y with shape
+        (n_samples, 1).
+    """
+    n_rows = draw(n_samples)
+    n_cols = draw(n_features)
+    features = draw(arrays(dtype=dtypes, shape=(n_rows, n_cols)))
+    targets = draw(arrays(dtype=dtypes, shape=(n_rows, 1)))
+    return features, targets
+
+
+@composite
+def split_datasets(
+    draw,
+    datasets=datasets(),
+    train_sizes=floats(min_value=0.1, max_value=1.0, exclude_max=True),
+):
+    """
+    Hypothesis strategy that generates a train/test split of a dataset.
+
+    Parameters
+    ----------
+    datasets : strategy
+        Strategy producing the (X, y) pair that gets split.
+    train_sizes : strategy
+        Strategy for the value passed to ``train_test_split`` as
+        ``train_size``.
+
+    Returns
+    -------
+    The result of ``train_test_split(X, y, train_size=...)``.
+    """
+    features, targets = draw(datasets)
+    train_fraction = draw(train_sizes)
+    # train_test_split limitation: reject draws for which
+    # int(len(X) * train_size) is zero.
+    n_train = int(len(features) * train_fraction)
+    assume(n_train > 0)
+    return train_test_split(
+        features, targets, train_size=train_fraction
+    )
+
+
+@composite
+def regression_datasets(
+    draw,
+    dtypes=floating_dtypes(),
+    n_samples=integers(min_value=0, max_value=200),
+    n_features=integers(min_value=0, max_value=200),
+    n_informatives=integers(min_value=0, max_value=200),
+    is_normal=booleans(),
+):
+    """
+    Hypothesis strategy that generates an (X, y) regression dataset.
+
+    Parameters
+    ----------
+    dtypes : strategy
+        Strategy for the dtype of the returned arrays.
+    n_samples : strategy
+        Strategy for the number of samples.
+    n_features : strategy
+        Strategy for the number of features.
+    n_informatives : strategy
+        Strategy for the ``n_informative`` argument of
+        ``make_regression``; only used when ``is_normal`` draws True.
+    is_normal : strategy
+        Strategy for a boolean. When it draws True the data comes from
+        ``make_regression`` and is cast to one drawn dtype; otherwise
+        the ``datasets`` strategy is used with the same ``dtypes``,
+        ``n_samples`` and ``n_features`` strategies.
+
+    Returns
+    -------
+    (X, y) : tuple
+    """
+    if not draw(is_normal):
+        # Delegate to the generic array-based strategy.
+        fallback = datasets(
+            dtypes=dtypes,
+            n_samples=n_samples,
+            n_features=n_features,
+        )
+        return draw(fallback)
+
+    # Draw in a fixed order: dtype first, then the three sizes.
+    target_dtype = draw(dtypes)
+    sample_count = draw(n_samples)
+    feature_count = draw(n_features)
+    informative_count = draw(n_informatives)
+    X, y = make_regression(
+        n_samples=sample_count,
+        n_features=feature_count,
+        n_informative=informative_count,
+    )
+    return X.astype(target_dtype), y.astype(target_dtype)
diff --git a/python/cuml/testing/utils.py b/python/cuml/testing/utils.py
@@ -39,6 +39,21 @@
 import pytest
 
 
+def array_difference(a, b, with_sign=True):
+    """
+    Utility function to compute the sum of the absolute element-wise
+    differences between 2 arrays.
+
+    Both inputs are first converted with ``to_nparray``. Returns 0 when
+    both converted arrays have length zero. When ``with_sign`` is False
+    the absolute values of the inputs are compared instead of the
+    signed values.
+    """
+    lhs = to_nparray(a)
+    rhs = to_nparray(b)
+
+    # Nothing to compare when both inputs are empty.
+    if not (len(lhs) or len(rhs)):
+        return 0
+
+    if with_sign:
+        delta = lhs - rhs
+    else:
+        delta = np.abs(lhs) - np.abs(rhs)
+    return np.sum(np.abs(delta))
+
+
 def array_equal(a, b, unit_tol=1e-4, total_tol=1e-4, with_sign=True):
     """
     Utility function to compare 2 numpy arrays. Two individual elements

@@ -16,13 +16,24 @@
 import cupy as cp
 import numpy as np
 import pytest
+from hypothesis import (
+    assume,
+    example,
+    given,
+    settings,
+    strategies as st,
+    target
+)
+from hypothesis.extra.numpy import floating_dtypes
 from distutils.version import LooseVersion
 import cudf
 from cuml import ElasticNet as cuElasticNet
 from cuml import LinearRegression as cuLinearRegression
 from cuml import LogisticRegression as cuLog
 from cuml import Ridge as cuRidge
+from cuml.testing.strategies import split_datasets, regression_datasets
 from cuml.testing.utils import (
+    array_difference,
     array_equal,
     small_regression_dataset,
     small_classification_dataset,
@@ -193,10 +204,38 @@ def test_linear_regression_single_column():
         model.fit(cp.random.rand(46341), cp.random.rand(46341))
 
 
-@pytest.mark.parametrize("datatype", [np.float32, np.float64])
-def test_linear_regression_model_default(datatype):
-
-    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)
+@given(
+    split_datasets(
+        regression_datasets(
+            # Two assumptions required for cuml.LinearRegression:
+            n_samples=st.integers(
+                min_value=20, max_value=200
+            ),  # assuming min(train_size)=0.1
+            dtypes=floating_dtypes(sizes=(32, 64)),
+        )
+    )
+)
+@example(small_regression_dataset(np.float32))
+@example(small_regression_dataset(np.float64))
+@settings(
+    deadline=5000, max_examples=20
+)  # TODO: re-evaluate max_examples after benchmarking
+def test_linear_regression_model_default(dataset):
+
+    X_train, X_test, y_train, y_test = dataset
+    n_rows, n_cols = X_train.shape
+
+    ## Required assumptions:
+    # sklinearRegression:
+    assume(n_cols >= 1)
+    assume((X_train > 0).any())
+    assume((y_train > 0).any())
+    assume(np.isfinite(X_train).all())
+    assume(np.isfinite(y_train).all())
+    # cuml.LinearRegression:
+    assume(n_rows >= 2)
+    # both:
+    assume(n_cols >= 1)
 
     # Initialization of cuML's linear regression model
     cuols = cuLinearRegression()
@@ -211,6 +250,7 @@ def test_linear_regression_model_default(datatype):
 
     skols_predict = skols.predict(X_test)
 
+    target(float(array_difference(skols_predict, cuols_predict)))
     assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
 
 

@@ -0,0 +1,54 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from cuml.testing.strategies import (
+    datasets,
+    regression_datasets,
+    split_datasets,
+)
+from hypothesis import given
+
+
+@given(datasets())
+def test_datasets(dataset):
+    """Check the dimensionality of arrays drawn from ``datasets``."""
+    features, targets = dataset
+
+    assert features.ndim == 2
+    assert 0 <= targets.ndim <= 2
+
+
+@given(split_datasets())
+def test_split_datasets(split_dataset):
+    """Check the dimensionality of the parts drawn from ``split_datasets``."""
+    train_X, test_X, train_y, test_y = split_dataset
+
+    # Both feature parts must be two-dimensional.
+    assert train_X.ndim == 2
+    assert test_X.ndim == 2
+    # Both target parts must agree and have at most two dimensions.
+    assert train_y.ndim == test_y.ndim
+    assert 0 <= train_y.ndim <= 2
+
+
+@given(regression_datasets())
+def test_regression_datasets(dataset):
+    """Check the dimensionality of arrays drawn from ``regression_datasets``."""
+    features, targets = dataset
+
+    assert features.ndim == 2
+    assert 0 <= targets.ndim <= 2
+
+
+@given(split_datasets(datasets=regression_datasets()))
+def test_split_regression_datasets(split_dataset):
+    """Check the dimensionality of split ``regression_datasets`` draws."""
+    train_X, test_X, train_y, test_y = split_dataset
+
+    # Both feature parts must be two-dimensional.
+    assert train_X.ndim == 2
+    assert test_X.ndim == 2
+    # Both target parts must agree and have at most two dimensions.
+    assert train_y.ndim == test_y.ndim
+    assert 0 <= train_y.ndim <= 2