[REVIEW] Speed up test_linear_model #3075

Merged · 9 commits · Oct 29, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -3,6 +3,7 @@
## New Features

## Improvements
- PR #3075: Speed up test_linear_model tests
- PR #3078: Speed up test_incremental_pca tests
- PR #2902: `matrix/matrix.cuh` in RAFT namespacing
- PR #2903: Moving linalg's gemm, gemv, transpose to RAFT namespaces
78 changes: 50 additions & 28 deletions python/cuml/test/test_linear_model.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from functools import lru_cache
import cupy as cp
import numpy as np
import pytest
@@ -38,17 +39,25 @@
from sklearn.model_selection import train_test_split


-def make_regression_dataset(datatype, nrows, ncols, n_info):
+def _make_regression_dataset_uncached(nrows, ncols, n_info):
     X, y = make_regression(
         n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0
     )
-    X = X.astype(datatype)
-    y = y.astype(datatype)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, train_size=0.8, random_state=10
-    )
+    return train_test_split(X, y, train_size=0.8, random_state=10)

-    return X_train, X_test, y_train, y_test

+@lru_cache(4)
+def _make_regression_dataset_from_cache(nrows, ncols, n_info):
+    return _make_regression_dataset_uncached(nrows, ncols, n_info)


+def make_regression_dataset(datatype, nrows, ncols, n_info):
+    if nrows * ncols < 1e8:  # Keep cache under 4 GB
+        dataset = _make_regression_dataset_from_cache(nrows, ncols, n_info)
+    else:
+        dataset = _make_regression_dataset_uncached(nrows, ncols, n_info)
+
+    return map(lambda arr: arr.astype(datatype), dataset)


def make_classification_dataset(datatype, nrows, ncols, n_info, num_classes):
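The caching helpers added in the hunk above are the core of the speedup: `make_regression` is memoized on its integer arguments via `lru_cache`, a size guard (`nrows * ncols < 1e8`, keeping the cache under roughly 4 GB per the inline comment) bypasses the cache for very large datasets, and the dtype cast happens after the cache lookup so a single cached copy serves both float32 and float64. A minimal standalone sketch of the same pattern, for reference only (the helper names `_cached_split` and `regression_dataset` are hypothetical, not part of this PR):

```python
# Sketch of the lru_cache-based dataset caching pattern (hypothetical names).
from functools import lru_cache

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


@lru_cache(4)  # memoize up to 4 distinct (nrows, ncols, n_info) datasets
def _cached_split(nrows, ncols, n_info):
    # Arguments are plain ints, so the call is hashable and cacheable.
    X, y = make_regression(
        n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0
    )
    return train_test_split(X, y, train_size=0.8, random_state=10)


def regression_dataset(datatype, nrows, ncols, n_info):
    # Cast after the cache lookup so one cached copy serves every dtype.
    return [arr.astype(datatype) for arr in _cached_split(nrows, ncols, n_info)]


# Tests unpack the four splits exactly as before:
X_train, X_test, y_train, y_test = regression_dataset(np.float32, 1000, 20, 10)
```

Because `astype` returns a copy by default, a test can mutate the arrays it receives without corrupting the cached originals.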
@@ -197,15 +206,24 @@ def test_ridge_regression_model(datatype, algorithm, nrows, column_info):
with_sign=True)


-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("penalty", ["none", "l1", "l2", "elasticnet"])
-@pytest.mark.parametrize("l1_ratio", [1.0])
-@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize(
+    "num_classes, dtype, penalty, l1_ratio, fit_intercept, C, tol", [
+        # L-BFGS Solver
+        (2, np.float32, "none", 1.0, True, 1.0, 1e-3),
+        (2, np.float64, "l2", 1.0, True, 1.0, 1e-8),
+        (10, np.float32, "elasticnet", 0.0, True, 1.0, 1e-3),
+        (10, np.float32, "none", 1.0, False, 1.0, 1e-8),
+        (10, np.float32, "none", 1.0, False, 2.0, 1e-3),
+        # OWL-QN Solver
+        (2, np.float32, "l1", 1.0, True, 1.0, 1e-3),
+        (2, np.float64, "elasticnet", 1.0, True, 1.0, 1e-8),
+        (10, np.float32, "l1", 1.0, True, 1.0, 1e-3),
+        (10, np.float32, "l1", 1.0, False, 1.0, 1e-8),
+        (10, np.float32, "elasticnet", 1.0, False, 0.5, 1e-3),
+    ]
+)
 @pytest.mark.parametrize("nrows", [unit_param(1000)])
 @pytest.mark.parametrize("column_info", [unit_param([20, 10])])
-@pytest.mark.parametrize("C", [2.0, 1.0, 0.5])
-@pytest.mark.parametrize("tol", [1e-3, 1e-8])
def test_logistic_regression(
num_classes, dtype, penalty, l1_ratio,
fit_intercept, nrows, column_info, C, tol
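Folding the per-parameter `parametrize` decorators for `num_classes`, `dtype`, `penalty`, `l1_ratio`, `fit_intercept`, `C`, and `tol` into one decorator with explicit tuples turns the test matrix from a full Cartesian product into only the hand-picked cases above. A rough count using the values visible in this diff (the snippet is illustrative, not part of the test file):

```python
# Old: stacked decorators multiply together
# num_classes * dtype * penalty * l1_ratio * fit_intercept
# * nrows * column_info * C * tol
old_cases = 2 * 2 * 4 * 1 * 2 * 1 * 1 * 3 * 2  # 192 invocations

# New: one decorator listing the cases explicitly
new_cases = 10  # 5 L-BFGS rows + 5 OWL-QN rows

print(old_cases, new_cases)  # 192 -> 10
```

The explicit list still covers every penalty, both dtypes, both class counts, both `fit_intercept` settings, and both solvers, just without crossing every value with every other.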
@@ -276,19 +294,21 @@ def test_logistic_regression(
assert len(np.unique(cu_preds)) == len(np.unique(y_test))


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("penalty", ["none", "l1", "l2", "elasticnet"])
-def test_logistic_regression_unscaled(dtype, penalty):
+@pytest.mark.parametrize("dtype, penalty, l1_ratio", [
+    (np.float32, "none", 1.0),
+    (np.float64, "l2", 0.0),
+    (np.float32, "elasticnet", 1.0),
+    (np.float64, "l1", None),
+])
+def test_logistic_regression_unscaled(dtype, penalty, l1_ratio):
     # Test logistic regression on the breast cancer dataset. We do not scale
     # the dataset which could lead to numerical problems (fixed in PR #2543).
     X, y = load_breast_cancer(return_X_y=True)
     X = X.astype(dtype)
     y = y.astype(dtype)
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
     params = {"penalty": penalty, "C": 1, "tol": 1e-4, "fit_intercept": True,
-              'max_iter': 5000}
-    if penalty == "elasticnet":
-        params["l1_ratio"] = 1.0
+              'max_iter': 5000, "l1_ratio": l1_ratio}
culog = cuLog(**params)
culog.fit(X_train, y_train)

@@ -314,11 +334,12 @@ def test_logistic_regression_model_default(dtype):
assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - 0.022


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("nrows", [10, 100])
+@pytest.mark.parametrize("dtype, nrows, num_classes, fit_intercept", [
+    (np.float32, 10, 2, True),
+    (np.float64, 100, 10, False),
+    (np.float64, 100, 2, True)
+])
 @pytest.mark.parametrize("column_info", [(20, 10)])
-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("fit_intercept", [True, False])
def test_logistic_regression_decision_function(
dtype, nrows, column_info, num_classes, fit_intercept
):
@@ -350,11 +371,12 @@ def test_logistic_regression_decision_function(
assert array_equal(cu_dec_func, sk_dec_func)


-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
-@pytest.mark.parametrize("nrows", [10, 100])
+@pytest.mark.parametrize("dtype, nrows, num_classes, fit_intercept", [
+    (np.float32, 10, 2, True),
+    (np.float64, 100, 10, False),
+    (np.float64, 100, 2, True)
+])
 @pytest.mark.parametrize("column_info", [(20, 10)])
-@pytest.mark.parametrize("num_classes", [2, 10])
-@pytest.mark.parametrize("fit_intercept", [True, False])
def test_logistic_regression_predict_proba(
dtype, nrows, column_info, num_classes, fit_intercept
):