Update Scikit-learn compatibility to 1.2 (#5141)
Authors:
   - Dante Gama Dessavre (https://github.com/dantegd)
   - AJ Schmidt (https://github.com/ajschmidt8)

Approvers:
   - William Hicks (https://github.com/wphicks)
   - AJ Schmidt (https://github.com/ajschmidt8)
dantegd authored Jan 26, 2023
1 parent f6d72fc commit 79d0b9e
Showing 21 changed files with 139 additions and 103 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-115_arch-x86_64.yaml
@@ -54,7 +54,7 @@ dependencies:
- raft-dask=23.02.*
- rmm=23.02.*
- scikit-build>=0.13.1
- scikit-learn=0.24
- scikit-learn=1.2
- seaborn
- sparse
- sphinx-markdown-tables
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -213,7 +213,7 @@ dependencies:
- pytest-cases
- pytest-cov
- pytest-xdist
- &scikit_learn scikit-learn=0.24
- &scikit_learn scikit-learn=1.2
- seaborn
- sparse
- statsmodels
15 changes: 8 additions & 7 deletions notebooks/kmeans_demo.ipynb

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions notebooks/linear_regression_demo.ipynb
@@ -109,7 +109,6 @@
"source": [
"%%time\n",
"ols_sk = skLinearRegression(fit_intercept=True,\n",
" normalize=True,\n",
" n_jobs=-1)\n",
"\n",
"ols_sk.fit(X_train, y_train)"
@@ -157,7 +156,6 @@
"source": [
"%%time\n",
"ols_cuml = cuLinearRegression(fit_intercept=True,\n",
" normalize=True,\n",
" algorithm='eig')\n",
"\n",
"ols_cuml.fit(X_cudf, y_cudf)"
@@ -203,7 +201,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -217,7 +215,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.15"
}
},
"nbformat": 4,
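The notebook hunks above drop the `normalize=True` keyword from both the scikit-learn and cuML `LinearRegression` calls, since scikit-learn 1.2 removed that parameter from its linear models. One common replacement is an explicit scaling step in a pipeline; a minimal sketch, assuming the `X_train`/`y_train` arrays defined earlier in the notebook (the `StandardScaler` step is not numerically identical to the old `normalize=True` behaviour):

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Scale explicitly in a preprocessing stage instead of passing normalize=True
ols_sk = make_pipeline(StandardScaler(),
                       LinearRegression(fit_intercept=True, n_jobs=-1))
ols_sk.fit(X_train, y_train)
```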
21 changes: 9 additions & 12 deletions python/cuml/decomposition/pca.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -362,8 +362,7 @@ class PCA(UniversalBase,
self._sparse_model = True

self.n_samples_ = X.shape[0]
self.n_features_ = X.shape[1] if X.ndim == 2 else 1
self.n_features_in_ = self.n_features_
self.n_features_in_ = X.shape[1] if X.ndim == 2 else 1
self.dtype = X.dtype

# NOTE: All intermediate calculations are done using cupy.ndarray and
@@ -387,7 +386,7 @@
self.explained_variance_ratio_ = self.explained_variance_ / cp.sum(
self.explained_variance_)

if self.n_components_ < min(self.n_samples_, self.n_features_):
if self.n_components_ < min(self.n_samples_, self.n_features_in_):
self.noise_variance_ = \
self.explained_variance_[self.n_components_:].mean()
else:
@@ -433,16 +432,15 @@
X = sparse_scipy_to_cp(X, dtype=None)
return self._sparse_fit(X)

X_m, self.n_samples_, self.n_features_, self.dtype = \
X_m, self.n_samples_, self.n_features_in_, self.dtype = \
input_to_cuml_array(X, check_dtype=[np.float32, np.float64])
cdef uintptr_t input_ptr = X_m.ptr
self.n_features_in_ = self.n_features_
self.feature_names_in_ = X_m.index

cdef paramsPCA *params = <paramsPCA*><size_t> \
self._build_params(self.n_samples_, self.n_features_)
self._build_params(self.n_samples_, self.n_features_in_)

if params.n_components > self.n_features_:
if params.n_components > self.n_features_in_:
raise ValueError('Number of components should not be greater than'
'the number of columns in the data')

@@ -587,7 +585,7 @@
cpdef paramsPCA params
params.n_components = self.n_components_
params.n_rows = n_rows
params.n_cols = self.n_features_
params.n_cols = self.n_features_in_
params.whiten = self.whiten

input_data = CumlArray.zeros((params.n_rows, params.n_cols),
@@ -675,7 +673,7 @@
input_to_cuml_array(X, check_dtype=dtype,
convert_to_dtype=(dtype if convert_dtype
else None),
check_cols=self.n_features_)
check_cols=self.n_features_in_)

cdef uintptr_t input_ptr = X_m.ptr

@@ -735,5 +733,4 @@
return ['components_', 'explained_variance_',
'explained_variance_ratio_', 'singular_values_',
'mean_', 'n_components_', 'noise_variance_',
'n_samples_', 'n_features_', 'n_features_in_',
'feature_names_in_']
'n_samples_', 'n_features_in_', 'feature_names_in_']
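The PCA changes above drop the old `n_features_` attribute in favour of `n_features_in_`, the name scikit-learn 1.2 standardizes on for the number of columns seen during fit. A small sketch of the resulting behaviour, assuming a working cuML install (the array shape is arbitrary):

```python
import cupy as cp
from cuml.decomposition import PCA

X = cp.random.random((100, 8)).astype(cp.float32)
pca = PCA(n_components=3).fit(X)
print(pca.n_features_in_)  # 8, the number of input columns seen during fit
```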
2 changes: 2 additions & 0 deletions python/cuml/solvers/sgd.pyx
@@ -237,6 +237,8 @@ class SGD(Base,
msg = "loss {!r} is not supported"
raise TypeError(msg.format(loss))

if penalty is None:
penalty = 'none'
if penalty in ['none', 'l1', 'l2', 'elasticnet']:
self.penalty = penalty
else:
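The two added lines map `penalty=None`, the spelling scikit-learn 1.2 uses for "no regularization", onto the `'none'` string that cuML's SGD handles internally. A short usage sketch under that assumption:

```python
from cuml.solvers import SGD

# Both spellings now select "no regularization"
sgd_new = SGD(loss='squared_loss', penalty=None)
sgd_old = SGD(loss='squared_loss', penalty='none')
```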
2 changes: 1 addition & 1 deletion python/cuml/svm/svc.pyx
@@ -618,7 +618,7 @@ class SVC(SVMBase,

with cuml.internals.exit_internal_api():
for clf in self.prob_svc.calibrated_classifiers_:
df = df + clf.base_estimator.decision_function(X)
df = df + clf.estimator.decision_function(X)
df = df / len(self.prob_svc.calibrated_classifiers_)
return df
elif self.n_classes_ > 2:
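The one-line SVC change follows an upstream rename: the fitted sub-classifiers stored in `CalibratedClassifierCV.calibrated_classifiers_` now expose the wrapped model as `.estimator` instead of the old `.base_estimator`. A standalone sketch of the scikit-learn side of that API, on illustrative data (not cuML code):

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=0)
cal = CalibratedClassifierCV(SVC(), cv=3).fit(X, y)
for clf in cal.calibrated_classifiers_:
    df = clf.estimator.decision_function(X)  # was clf.base_estimator before 1.2
```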
35 changes: 35 additions & 0 deletions python/cuml/tests/conftest.py
@@ -16,17 +16,20 @@

from cuml.testing.utils import create_synthetic_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets
from sklearn.datasets import make_regression as skl_make_reg
from sklearn.datasets import make_classification as skl_make_clas
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import Bunch
from datetime import timedelta
from math import ceil
import hypothesis
from cuml.internals.safe_imports import gpu_only_import
import pytest
import os
import subprocess
import pandas as pd

from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
@@ -180,6 +183,38 @@ def housing_dataset():
return X, y, feature_names


@pytest.fixture(scope="module")
def deprecated_boston_dataset():
# dataset was removed in Scikit-learn 1.2, we should change it for a
# better dataset for tests, see
# https://github.com/rapidsai/cuml/issues/5158

df = pd.read_csv('https://raw.githubusercontent.com/scikit-learn/scikit-learn/baf828ca126bcb2c0ad813226963621cafe38adb/sklearn/datasets/data/boston_house_prices.csv', header=None) # noqa: E501
n_samples = int(df[0][0])
data = df[list(np.arange(13))].values[2:n_samples].astype(np.float64)
targets = df[13].values[2:n_samples].astype(np.float64)

return Bunch(
data=data,
target=targets,
)


@pytest.fixture(scope="module", params=["digits",
"deprecated_boston_dataset",
"diabetes",
"cancer"])
def test_datasets(request, deprecated_boston_dataset):
test_datasets_dict = {
"digits": datasets.load_digits(),
"deprecated_boston_dataset": deprecated_boston_dataset,
"diabetes": datasets.load_diabetes(),
"cancer": datasets.load_breast_cancer(),
}

return test_datasets_dict[request.param]


@pytest.fixture(scope="session")
def random_seed(request):
current_random_seed = os.getenv('PYTEST_RANDOM_SEED')
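The new `test_datasets` fixture above is parametrized at module scope, so any test that takes it as an argument runs once per dataset name in `params`; the hdbscan tests further down consume it exactly that way. A minimal, hypothetical consumer for illustration:

```python
def test_dataset_is_nonempty(test_datasets):  # hypothetical test, not in the diff
    # pytest runs this once each for "digits", "deprecated_boston_dataset",
    # "diabetes" and "cancer"
    assert test_datasets.data.shape[0] > 0
```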
4 changes: 4 additions & 0 deletions python/cuml/tests/test_compose.py
@@ -13,6 +13,7 @@
# limitations under the License.
#


from cuml.testing.test_preproc_utils import assert_allclose
from sklearn.preprocessing import \
StandardScaler as skStandardScaler, \
@@ -197,6 +198,9 @@ def test_make_column_transformer_sparse(sparse_clf_dataset, # noqa: F811
assert_allclose(t_X, sk_t_X)


@pytest.mark.skip(reason="scikit-learn replaced get_feature_names with "
"get_feature_names_out"
"https://github.com/rapidsai/cuml/issues/5159")
def test_column_transformer_get_feature_names(clf_dataset): # noqa: F811
X_np, X = clf_dataset

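The newly skipped test relies on `ColumnTransformer.get_feature_names`, which scikit-learn 1.2 replaced with `get_feature_names_out` (see the linked issue). A small sketch of the replacement API on toy data:

```python
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

ct = make_column_transformer((StandardScaler(), [0, 1]))
ct.fit(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]))
print(ct.get_feature_names_out())  # e.g. ['standardscaler__x0' 'standardscaler__x1']
```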
12 changes: 6 additions & 6 deletions python/cuml/tests/test_coordinate_descent.py
@@ -45,7 +45,7 @@ def test_lasso(datatype, X_type, alpha, algorithm,
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
random_state=0)
cu_lasso = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
max_iter=1000,
selection=algorithm, tol=1e-10)

cu_lasso.fit(X_train, y_train)
@@ -55,8 +55,8 @@ def test_lasso(datatype, X_type, alpha, algorithm,
cu_r2 = r2_score(y_test, cu_predict)

if nrows < 500000:
sk_lasso = Lasso(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
sk_lasso = Lasso(alpha=alpha, fit_intercept=True,
max_iter=1000,
selection=algorithm, tol=1e-10)
sk_lasso.fit(X_train, y_train)
sk_predict = sk_lasso.predict(X_test)
@@ -162,7 +162,7 @@ def test_elastic_net(datatype, X_type, alpha, algorithm,
random_state=0)

elastic_cu = cuElasticNet(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
max_iter=1000,
selection=algorithm, tol=1e-10)

elastic_cu.fit(X_train, y_train)
@@ -171,8 +171,8 @@ def test_elastic_net(datatype, X_type, alpha, algorithm,
cu_r2 = r2_score(y_test, cu_predict)

if nrows < 500000:
elastic_sk = ElasticNet(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
elastic_sk = ElasticNet(alpha=alpha, fit_intercept=True,
max_iter=1000,
selection=algorithm, tol=1e-10)
elastic_sk.fit(X_train, y_train)
sk_predict = elastic_sk.predict(X_test)
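The test updates above reflect two scikit-learn 1.2 changes for the reference estimators: the `normalize` keyword is gone from `Lasso`/`ElasticNet`, and `alpha` is now validated as a plain scalar (a one-element array, as the diff removes, is rejected). A minimal sketch of the 1.2-style calls:

```python
from sklearn.linear_model import Lasso, ElasticNet

sk_lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=1000, tol=1e-10)
sk_enet = ElasticNet(alpha=0.1, fit_intercept=True, max_iter=1000, tol=1e-10)
```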
2 changes: 0 additions & 2 deletions python/cuml/tests/test_device_selection.py
@@ -176,12 +176,10 @@ def fixture_generation_helper(params):
'input_type': ['numpy', 'dataframe', 'cupy',
'cudf', 'numba'],
'fit_intercept': [False, True],
'normalize': [False, True]
}))
def linreg_test_data(request):
kwargs = {
'fit_intercept': request.param['fit_intercept'],
'normalize': request.param['normalize'],
}

sk_model = skLinearRegression(**kwargs)
18 changes: 4 additions & 14 deletions python/cuml/tests/test_hdbscan.py
@@ -36,12 +36,6 @@

cp = gpu_only_import('cupy')

test_datasets = {
"digits": datasets.load_digits(),
"boston": datasets.load_boston(),
"diabetes": datasets.load_diabetes(),
"cancer": datasets.load_breast_cancer(),
}

dataset_names = ['noisy_circles', 'noisy_moons', 'varied']

@@ -208,14 +202,13 @@ def test_hdbscan_blobs(nrows, ncols, nclusters,

@pytest.mark.skipif(cp.cuda.driver.get_build_version() <= 11020,
reason="Test failing on driver 11.2")
@pytest.mark.parametrize('dataset', test_datasets.values())
@pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
@pytest.mark.parametrize('min_samples_cluster_size_bounds', [(150, 150, 0),
(50, 25, 0)])
@pytest.mark.parametrize('allow_single_cluster', [True, False])
@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
@pytest.mark.parametrize('connectivity', ['knn'])
def test_hdbscan_sklearn_datasets(dataset,
def test_hdbscan_sklearn_datasets(test_datasets,
connectivity,
cluster_selection_epsilon,
cluster_selection_method,
@@ -225,7 +218,7 @@
min_samples, min_cluster_size, max_cluster_size = \
min_samples_cluster_size_bounds

X = dataset.data
X = test_datasets.data

cuml_agg = HDBSCAN(verbose=logger.level_info,
allow_single_cluster=allow_single_cluster,
@@ -261,25 +254,22 @@
rtol=0.1, atol=0.1)


@pytest.mark.parametrize('dataset', test_datasets.values())
@pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
@pytest.mark.parametrize('min_samples', [150, 50, 5, 400])
@pytest.mark.parametrize('min_cluster_size', [150, 25, 5, 250])
@pytest.mark.parametrize('max_cluster_size', [0])
@pytest.mark.parametrize('allow_single_cluster', [True, False])
@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
@pytest.mark.parametrize('connectivity', ['knn'])
def test_hdbscan_sklearn_extract_clusters(dataset,
def test_hdbscan_sklearn_extract_clusters(test_datasets,
connectivity,
cluster_selection_epsilon,
cluster_selection_method,
min_samples,
min_cluster_size,
max_cluster_size,
allow_single_cluster):

X = dataset.data

X = test_datasets.data
cuml_agg = HDBSCAN(verbose=logger.level_info,
allow_single_cluster=allow_single_cluster,
gen_min_span_tree=True,
11 changes: 8 additions & 3 deletions python/cuml/tests/test_lars.py
@@ -12,8 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sklearn

from sklearn.linear_model import Lars as skLars
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from cuml.testing.utils import (
array_equal,
unit_param,
@@ -141,6 +144,8 @@ def test_lars_collinear(datatype, nrows, column_info, precompute):
assert culars.score(X_test, y_test) > 0.85


@pytest.mark.skipif(sklearn.__version__ >= "1.0",
reason="discrepancies on coefficients with sklearn 1.2")
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("params", [{"precompute": True},
{"precompute": False},
@@ -149,7 +154,7 @@
{"n_nonzero_coefs": 2,
"fit_intercept": False}])
def test_lars_attributes(datatype, params):
X, y = load_boston(return_X_y=True)
X, y = fetch_california_housing(return_X_y=True)
X = X.astype(datatype)
y = y.astype(datatype)

@@ -191,7 +196,7 @@ def test_lars_attributes(datatype, params):

@pytest.mark.parametrize("datatype", [np.float32, np.float64])
def test_lars_copy_X(datatype):
X, y = load_boston(return_X_y=True)
X, y = fetch_california_housing(return_X_y=True)
X = cp.asarray(X, dtype=datatype, order='F')
y = cp.asarray(y, dtype=datatype, order='F')

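The LARS tests swap `load_boston`, removed in scikit-learn 1.2, for `fetch_california_housing`. Unlike the old loader, the replacement fetches the data over the network on first use and caches it locally; a one-line sketch:

```python
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)  # downloads on first call, then cached
```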