[REVIEW] Speed up test_linear_model #3075

Merged · 9 commits · Oct 29, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -3,6 +3,7 @@
## New Features

## Improvements
- PR #3075: Speed up test_linear_model tests
- PR #3078: Speed up test_incremental_pca tests
- PR #2902: `matrix/matrix.cuh` in RAFT namespacing
- PR #2903: Moving linalg's gemm, gemv, transpose to RAFT namespaces
78 changes: 50 additions & 28 deletions python/cuml/test/test_linear_model.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from functools import lru_cache
import cupy as cp
import numpy as np
import pytest
@@ -38,17 +39,25 @@
from sklearn.model_selection import train_test_split


-def make_regression_dataset(datatype, nrows, ncols, n_info):
+def _make_regression_dataset_uncached(nrows, ncols, n_info):
     X, y = make_regression(
         n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0
     )
-    X = X.astype(datatype)
-    y = y.astype(datatype)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, train_size=0.8, random_state=10
-    )
+    return train_test_split(X, y, train_size=0.8, random_state=10)

-    return X_train, X_test, y_train, y_test

+@lru_cache(4)
+def _make_regression_dataset_from_cache(nrows, ncols, n_info):
+    return _make_regression_dataset_uncached(nrows, ncols, n_info)


+def make_regression_dataset(datatype, nrows, ncols, n_info):
+    if nrows * ncols < 1e8:  # Keep cache under 4 GB
+        dataset = _make_regression_dataset_from_cache(nrows, ncols, n_info)
+    else:
+        dataset = _make_regression_dataset_uncached(nrows, ncols, n_info)
+
+    return map(lambda arr: arr.astype(datatype), dataset)


def make_classification_dataset(datatype, nrows, ncols, n_info, num_classes):
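The caching helpers added in the hunk above are the core of the speedup: `make_regression` is memoized on its integer arguments via `lru_cache`, a size guard (`nrows * ncols < 1e8`, keeping the cache under roughly 4 GB per the inline comment) bypasses the cache for very large datasets, and the dtype cast happens after the cache lookup so a single cached copy serves both float32 and float64. A minimal standalone sketch of the same pattern, for reference only (the helper names `_cached_split` and `regression_dataset` are hypothetical, not part of this PR):

```python
# Sketch of the lru_cache-based dataset caching pattern (hypothetical names).
from functools import lru_cache

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


@lru_cache(4)  # memoize up to 4 distinct (nrows, ncols, n_info) datasets
def _cached_split(nrows, ncols, n_info):
    # Arguments are plain ints, so the call is hashable and cacheable.
    X, y = make_regression(
        n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0
    )
    return train_test_split(X, y, train_size=0.8, random_state=10)


def regression_dataset(datatype, nrows, ncols, n_info):
    # Cast after the cache lookup so one cached copy serves every dtype.
    return [arr.astype(datatype) for arr in _cached_split(nrows, ncols, n_info)]


# Tests unpack the four splits exactly as before:
X_train, X_test, y_train, y_test = regression_dataset(np.float32, 1000, 20, 10)
```

Because `astype` returns a copy by default, a test can mutate the arrays it receives without corrupting the cached originals.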
@@ -197,15 +206,24 @@ def test_ridge_regression_model(datatype, algorithm, nrows, column_info):
with_sign=True)


-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("penalty", ["none", "l1", "l2", "elasticnet"])
-@pytest.mark.parametrize("l1_ratio", [1.0])
-@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize(
+    "num_classes, dtype, penalty, l1_ratio, fit_intercept, C, tol", [
+        # L-BFGS Solver
+        (2, np.float32, "none", 1.0, True, 1.0, 1e-3),
+        (2, np.float64, "l2", 1.0, True, 1.0, 1e-8),
+        (10, np.float32, "elasticnet", 0.0, True, 1.0, 1e-3),
+        (10, np.float32, "none", 1.0, False, 1.0, 1e-8),
+        (10, np.float32, "none", 1.0, False, 2.0, 1e-3),
+        # OWL-QN Solver
+        (2, np.float32, "l1", 1.0, True, 1.0, 1e-3),
+        (2, np.float64, "elasticnet", 1.0, True, 1.0, 1e-8),
+        (10, np.float32, "l1", 1.0, True, 1.0, 1e-3),
+        (10, np.float32, "l1", 1.0, False, 1.0, 1e-8),
+        (10, np.float32, "elasticnet", 1.0, False, 0.5, 1e-3),
+    ]
+)
 @pytest.mark.parametrize("nrows", [unit_param(1000)])
 @pytest.mark.parametrize("column_info", [unit_param([20, 10])])
-@pytest.mark.parametrize("C", [2.0, 1.0, 0.5])
-@pytest.mark.parametrize("tol", [1e-3, 1e-8])
def test_logistic_regression(
num_classes, dtype, penalty, l1_ratio,
fit_intercept, nrows, column_info, C, tol
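Folding the per-parameter `parametrize` decorators for `num_classes`, `dtype`, `penalty`, `l1_ratio`, `fit_intercept`, `C`, and `tol` into one decorator with explicit tuples turns the test matrix from a full Cartesian product into only the hand-picked cases above. A rough count using the values visible in this diff (the snippet is illustrative, not part of the test file):

```python
# Old: stacked decorators multiply together
# num_classes * dtype * penalty * l1_ratio * fit_intercept
# * nrows * column_info * C * tol
old_cases = 2 * 2 * 4 * 1 * 2 * 1 * 1 * 3 * 2  # 192 invocations

# New: one decorator listing the cases explicitly
new_cases = 10  # 5 L-BFGS rows + 5 OWL-QN rows

print(old_cases, new_cases)  # 192 -> 10
```

The explicit list still covers every penalty, both dtypes, both class counts, both `fit_intercept` settings, and both solvers, just without crossing every value with every other.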
@@ -276,19 +294,21 @@ def test_logistic_regression(
assert len(np.unique(cu_preds)) == len(np.unique(y_test))


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("penalty", ["none", "l1", "l2", "elasticnet"])
-def test_logistic_regression_unscaled(dtype, penalty):
+@pytest.mark.parametrize("dtype, penalty, l1_ratio", [
+    (np.float32, "none", 1.0),
+    (np.float64, "l2", 0.0),
+    (np.float32, "elasticnet", 1.0),
+    (np.float64, "l1", None),
+])
+def test_logistic_regression_unscaled(dtype, penalty, l1_ratio):
     # Test logistic regression on the breast cancer dataset. We do not scale
     # the dataset which could lead to numerical problems (fixed in PR #2543).
     X, y = load_breast_cancer(return_X_y=True)
     X = X.astype(dtype)
     y = y.astype(dtype)
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
     params = {"penalty": penalty, "C": 1, "tol": 1e-4, "fit_intercept": True,
-              'max_iter': 5000}
-    if penalty == "elasticnet":
-        params["l1_ratio"] = 1.0
+              'max_iter': 5000, "l1_ratio": l1_ratio}
culog = cuLog(**params)
culog.fit(X_train, y_train)

@@ -314,11 +334,12 @@ def test_logistic_regression_model_default(dtype):
assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - 0.022


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("nrows", [10, 100])
+@pytest.mark.parametrize("dtype, nrows, num_classes, fit_intercept", [
+    (np.float32, 10, 2, True),
+    (np.float64, 100, 10, False),
+    (np.float64, 100, 2, True)
+])
 @pytest.mark.parametrize("column_info", [(20, 10)])
-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("fit_intercept", [True, False])
def test_logistic_regression_decision_function(
dtype, nrows, column_info, num_classes, fit_intercept
):
@@ -350,11 +371,12 @@ def test_logistic_regression_decision_function(
assert array_equal(cu_dec_func, sk_dec_func)


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("nrows", [10, 100])
+@pytest.mark.parametrize("dtype, nrows, num_classes, fit_intercept", [
+    (np.float32, 10, 2, True),
+    (np.float64, 100, 10, False),
+    (np.float64, 100, 2, True)
+])
 @pytest.mark.parametrize("column_info", [(20, 10)])
-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("fit_intercept", [True, False])
def test_logistic_regression_predict_proba(
dtype, nrows, column_info, num_classes, fit_intercept
):