Update Scikit-learn compatibility to 1.2 (#5141)
Authors:
   - Dante Gama Dessavre (https://github.com/dantegd)
   - AJ Schmidt (https://github.com/ajschmidt8)

Approvers:
   - William Hicks (https://github.com/wphicks)
   - AJ Schmidt (https://github.com/ajschmidt8)
dantegd authored Jan 26, 2023
1 parent f6d72fc commit 79d0b9e
Showing 21 changed files with 139 additions and 103 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-115_arch-x86_64.yaml
@@ -54,7 +54,7 @@ dependencies:
- raft-dask=23.02.*
- rmm=23.02.*
- scikit-build>=0.13.1
- scikit-learn=0.24
- scikit-learn=1.2
- seaborn
- sparse
- sphinx-markdown-tables
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -213,7 +213,7 @@ dependencies:
- pytest-cases
- pytest-cov
- pytest-xdist
- &scikit_learn scikit-learn=0.24
- &scikit_learn scikit-learn=1.2
- seaborn
- sparse
- statsmodels
15 changes: 8 additions & 7 deletions notebooks/kmeans_demo.ipynb

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions notebooks/linear_regression_demo.ipynb
@@ -109,7 +109,6 @@
"source": [
"%%time\n",
"ols_sk = skLinearRegression(fit_intercept=True,\n",
" normalize=True,\n",
" n_jobs=-1)\n",
"\n",
"ols_sk.fit(X_train, y_train)"
@@ -157,7 +156,6 @@
"source": [
"%%time\n",
"ols_cuml = cuLinearRegression(fit_intercept=True,\n",
" normalize=True,\n",
" algorithm='eig')\n",
"\n",
"ols_cuml.fit(X_cudf, y_cudf)"
@@ -203,7 +201,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -217,7 +215,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.15"
}
},
"nbformat": 4,
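The notebook hunks above drop the `normalize=True` keyword from both the scikit-learn and cuML `LinearRegression` calls, since scikit-learn 1.2 removed that parameter from its linear models. One common replacement is an explicit scaling step in a pipeline; a minimal sketch, assuming the `X_train`/`y_train` arrays defined earlier in the notebook (the `StandardScaler` step is not numerically identical to the old `normalize=True` behaviour):

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Scale explicitly in a preprocessing stage instead of passing normalize=True
ols_sk = make_pipeline(StandardScaler(),
                       LinearRegression(fit_intercept=True, n_jobs=-1))
ols_sk.fit(X_train, y_train)
```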
21 changes: 9 additions & 12 deletions python/cuml/decomposition/pca.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -362,8 +362,7 @@ class PCA(UniversalBase,
self._sparse_model = True

self.n_samples_ = X.shape[0]
self.n_features_ = X.shape[1] if X.ndim == 2 else 1
self.n_features_in_ = self.n_features_
self.n_features_in_ = X.shape[1] if X.ndim == 2 else 1
self.dtype = X.dtype

# NOTE: All intermediate calculations are done using cupy.ndarray and
@@ -387,7 +386,7 @@
self.explained_variance_ratio_ = self.explained_variance_ / cp.sum(
self.explained_variance_)

if self.n_components_ < min(self.n_samples_, self.n_features_):
if self.n_components_ < min(self.n_samples_, self.n_features_in_):
self.noise_variance_ = \
self.explained_variance_[self.n_components_:].mean()
else:
@@ -433,16 +432,15 @@
X = sparse_scipy_to_cp(X, dtype=None)
return self._sparse_fit(X)

X_m, self.n_samples_, self.n_features_, self.dtype = \
X_m, self.n_samples_, self.n_features_in_, self.dtype = \
input_to_cuml_array(X, check_dtype=[np.float32, np.float64])
cdef uintptr_t input_ptr = X_m.ptr
self.n_features_in_ = self.n_features_
self.feature_names_in_ = X_m.index

cdef paramsPCA *params = <paramsPCA*><size_t> \
self._build_params(self.n_samples_, self.n_features_)
self._build_params(self.n_samples_, self.n_features_in_)

if params.n_components > self.n_features_:
if params.n_components > self.n_features_in_:
raise ValueError('Number of components should not be greater than'
'the number of columns in the data')

@@ -587,7 +585,7 @@
cpdef paramsPCA params
params.n_components = self.n_components_
params.n_rows = n_rows
params.n_cols = self.n_features_
params.n_cols = self.n_features_in_
params.whiten = self.whiten

input_data = CumlArray.zeros((params.n_rows, params.n_cols),
@@ -675,7 +673,7 @@
input_to_cuml_array(X, check_dtype=dtype,
convert_to_dtype=(dtype if convert_dtype
else None),
check_cols=self.n_features_)
check_cols=self.n_features_in_)

cdef uintptr_t input_ptr = X_m.ptr

@@ -735,5 +733,4 @@
return ['components_', 'explained_variance_',
'explained_variance_ratio_', 'singular_values_',
'mean_', 'n_components_', 'noise_variance_',
'n_samples_', 'n_features_', 'n_features_in_',
'feature_names_in_']
'n_samples_', 'n_features_in_', 'feature_names_in_']
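The PCA changes above drop the old `n_features_` attribute in favour of `n_features_in_`, the name scikit-learn 1.2 standardizes on for the number of columns seen during fit. A small sketch of the resulting behaviour, assuming a working cuML install (the array shape is arbitrary):

```python
import cupy as cp
from cuml.decomposition import PCA

X = cp.random.random((100, 8)).astype(cp.float32)
pca = PCA(n_components=3).fit(X)
print(pca.n_features_in_)  # 8, the number of input columns seen during fit
```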
2 changes: 2 additions & 0 deletions python/cuml/solvers/sgd.pyx
@@ -237,6 +237,8 @@ class SGD(Base,
msg = "loss {!r} is not supported"
raise TypeError(msg.format(loss))

if penalty is None:
penalty = 'none'
if penalty in ['none', 'l1', 'l2', 'elasticnet']:
self.penalty = penalty
else:
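The two added lines map `penalty=None`, the spelling scikit-learn 1.2 uses for "no regularization", onto the `'none'` string that cuML's SGD handles internally. A short usage sketch under that assumption:

```python
from cuml.solvers import SGD

# Both spellings now select "no regularization"
sgd_new = SGD(loss='squared_loss', penalty=None)
sgd_old = SGD(loss='squared_loss', penalty='none')
```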
2 changes: 1 addition & 1 deletion python/cuml/svm/svc.pyx
@@ -618,7 +618,7 @@ class SVC(SVMBase,

with cuml.internals.exit_internal_api():
for clf in self.prob_svc.calibrated_classifiers_:
df = df + clf.base_estimator.decision_function(X)
df = df + clf.estimator.decision_function(X)
df = df / len(self.prob_svc.calibrated_classifiers_)
return df
elif self.n_classes_ > 2:
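The one-line SVC change follows an upstream rename: the fitted sub-classifiers stored in `CalibratedClassifierCV.calibrated_classifiers_` now expose the wrapped model as `.estimator` instead of the old `.base_estimator`. A standalone sketch of the scikit-learn side of that API, on illustrative data (not cuML code):

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=0)
cal = CalibratedClassifierCV(SVC(), cv=3).fit(X, y)
for clf in cal.calibrated_classifiers_:
    df = clf.estimator.decision_function(X)  # was clf.base_estimator before 1.2
```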
35 changes: 35 additions & 0 deletions python/cuml/tests/conftest.py
@@ -16,17 +16,20 @@

from cuml.testing.utils import create_synthetic_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets
from sklearn.datasets import make_regression as skl_make_reg
from sklearn.datasets import make_classification as skl_make_clas
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import Bunch
from datetime import timedelta
from math import ceil
import hypothesis
from cuml.internals.safe_imports import gpu_only_import
import pytest
import os
import subprocess
import pandas as pd

from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
@@ -180,6 +183,38 @@ def housing_dataset():
return X, y, feature_names


@pytest.fixture(scope="module")
def deprecated_boston_dataset():
# dataset was removed in Scikit-learn 1.2, we should change it for a
# better dataset for tests, see
# https://github.com/rapidsai/cuml/issues/5158

df = pd.read_csv('https://raw.githubusercontent.com/scikit-learn/scikit-learn/baf828ca126bcb2c0ad813226963621cafe38adb/sklearn/datasets/data/boston_house_prices.csv', header=None) # noqa: E501
n_samples = int(df[0][0])
data = df[list(np.arange(13))].values[2:n_samples].astype(np.float64)
targets = df[13].values[2:n_samples].astype(np.float64)

return Bunch(
data=data,
target=targets,
)


@pytest.fixture(scope="module", params=["digits",
"deprecated_boston_dataset",
"diabetes",
"cancer"])
def test_datasets(request, deprecated_boston_dataset):
test_datasets_dict = {
"digits": datasets.load_digits(),
"deprecated_boston_dataset": deprecated_boston_dataset,
"diabetes": datasets.load_diabetes(),
"cancer": datasets.load_breast_cancer(),
}

return test_datasets_dict[request.param]


@pytest.fixture(scope="session")
def random_seed(request):
current_random_seed = os.getenv('PYTEST_RANDOM_SEED')
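The new `test_datasets` fixture above is parametrized at module scope, so any test that takes it as an argument runs once per dataset name in `params`; the hdbscan tests further down consume it exactly that way. A minimal, hypothetical consumer for illustration:

```python
def test_dataset_is_nonempty(test_datasets):  # hypothetical test, not in the diff
    # pytest runs this once each for "digits", "deprecated_boston_dataset",
    # "diabetes" and "cancer"
    assert test_datasets.data.shape[0] > 0
```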
4 changes: 4 additions & 0 deletions python/cuml/tests/test_compose.py
@@ -13,6 +13,7 @@
# limitations under the License.
#


from cuml.testing.test_preproc_utils import assert_allclose
from sklearn.preprocessing import \
StandardScaler as skStandardScaler, \
@@ -197,6 +198,9 @@ def test_make_column_transformer_sparse(sparse_clf_dataset, # noqa: F811
assert_allclose(t_X, sk_t_X)


@pytest.mark.skip(reason="scikit-learn replaced get_feature_names with "
"get_feature_names_out"
"https://github.com/rapidsai/cuml/issues/5159")
def test_column_transformer_get_feature_names(clf_dataset): # noqa: F811
X_np, X = clf_dataset

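The newly skipped test relies on `ColumnTransformer.get_feature_names`, which scikit-learn 1.2 replaced with `get_feature_names_out` (see the linked issue). A small sketch of the replacement API on toy data:

```python
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

ct = make_column_transformer((StandardScaler(), [0, 1]))
ct.fit(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]))
print(ct.get_feature_names_out())  # e.g. ['standardscaler__x0' 'standardscaler__x1']
```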
12 changes: 6 additions & 6 deletions python/cuml/tests/test_coordinate_descent.py
@@ -45,7 +45,7 @@ def test_lasso(datatype, X_type, alpha, algorithm,
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
random_state=0)
cu_lasso = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
max_iter=1000,
selection=algorithm, tol=1e-10)

cu_lasso.fit(X_train, y_train)
@@ -55,8 +55,8 @@ def test_lasso(datatype, X_type, alpha, algorithm,
cu_r2 = r2_score(y_test, cu_predict)

if nrows < 500000:
sk_lasso = Lasso(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
sk_lasso = Lasso(alpha=alpha, fit_intercept=True,
max_iter=1000,
selection=algorithm, tol=1e-10)
sk_lasso.fit(X_train, y_train)
sk_predict = sk_lasso.predict(X_test)
@@ -162,7 +162,7 @@ def test_elastic_net(datatype, X_type, alpha, algorithm,
random_state=0)

elastic_cu = cuElasticNet(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
max_iter=1000,
selection=algorithm, tol=1e-10)

elastic_cu.fit(X_train, y_train)
@@ -171,8 +171,8 @@ def test_elastic_net(datatype, X_type, alpha, algorithm,
cu_r2 = r2_score(y_test, cu_predict)

if nrows < 500000:
elastic_sk = ElasticNet(alpha=np.array([alpha]), fit_intercept=True,
normalize=False, max_iter=1000,
elastic_sk = ElasticNet(alpha=alpha, fit_intercept=True,
max_iter=1000,
selection=algorithm, tol=1e-10)
elastic_sk.fit(X_train, y_train)
sk_predict = elastic_sk.predict(X_test)
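The test updates above reflect two scikit-learn 1.2 changes for the reference estimators: the `normalize` keyword is gone from `Lasso`/`ElasticNet`, and `alpha` is now validated as a plain scalar (a one-element array, as the diff removes, is rejected). A minimal sketch of the 1.2-style calls:

```python
from sklearn.linear_model import Lasso, ElasticNet

sk_lasso = Lasso(alpha=0.1, fit_intercept=True, max_iter=1000, tol=1e-10)
sk_enet = ElasticNet(alpha=0.1, fit_intercept=True, max_iter=1000, tol=1e-10)
```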
2 changes: 0 additions & 2 deletions python/cuml/tests/test_device_selection.py
@@ -176,12 +176,10 @@ def fixture_generation_helper(params):
'input_type': ['numpy', 'dataframe', 'cupy',
'cudf', 'numba'],
'fit_intercept': [False, True],
'normalize': [False, True]
}))
def linreg_test_data(request):
kwargs = {
'fit_intercept': request.param['fit_intercept'],
'normalize': request.param['normalize'],
}

sk_model = skLinearRegression(**kwargs)
18 changes: 4 additions & 14 deletions python/cuml/tests/test_hdbscan.py
@@ -36,12 +36,6 @@

cp = gpu_only_import('cupy')

test_datasets = {
"digits": datasets.load_digits(),
"boston": datasets.load_boston(),
"diabetes": datasets.load_diabetes(),
"cancer": datasets.load_breast_cancer(),
}

dataset_names = ['noisy_circles', 'noisy_moons', 'varied']

@@ -208,14 +202,13 @@ def test_hdbscan_blobs(nrows, ncols, nclusters,

@pytest.mark.skipif(cp.cuda.driver.get_build_version() <= 11020,
reason="Test failing on driver 11.2")
@pytest.mark.parametrize('dataset', test_datasets.values())
@pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
@pytest.mark.parametrize('min_samples_cluster_size_bounds', [(150, 150, 0),
(50, 25, 0)])
@pytest.mark.parametrize('allow_single_cluster', [True, False])
@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
@pytest.mark.parametrize('connectivity', ['knn'])
def test_hdbscan_sklearn_datasets(dataset,
def test_hdbscan_sklearn_datasets(test_datasets,
connectivity,
cluster_selection_epsilon,
cluster_selection_method,
@@ -225,7 +218,7 @@
min_samples, min_cluster_size, max_cluster_size = \
min_samples_cluster_size_bounds

X = dataset.data
X = test_datasets.data

cuml_agg = HDBSCAN(verbose=logger.level_info,
allow_single_cluster=allow_single_cluster,
@@ -261,25 +254,22 @@
rtol=0.1, atol=0.1)


@pytest.mark.parametrize('dataset', test_datasets.values())
@pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
@pytest.mark.parametrize('min_samples', [150, 50, 5, 400])
@pytest.mark.parametrize('min_cluster_size', [150, 25, 5, 250])
@pytest.mark.parametrize('max_cluster_size', [0])
@pytest.mark.parametrize('allow_single_cluster', [True, False])
@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
@pytest.mark.parametrize('connectivity', ['knn'])
def test_hdbscan_sklearn_extract_clusters(dataset,
def test_hdbscan_sklearn_extract_clusters(test_datasets,
connectivity,
cluster_selection_epsilon,
cluster_selection_method,
min_samples,
min_cluster_size,
max_cluster_size,
allow_single_cluster):

X = dataset.data

X = test_datasets.data
cuml_agg = HDBSCAN(verbose=logger.level_info,
allow_single_cluster=allow_single_cluster,
gen_min_span_tree=True,
11 changes: 8 additions & 3 deletions python/cuml/tests/test_lars.py
@@ -12,8 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sklearn

from sklearn.linear_model import Lars as skLars
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from cuml.testing.utils import (
array_equal,
unit_param,
@@ -141,6 +144,8 @@ def test_lars_collinear(datatype, nrows, column_info, precompute):
assert culars.score(X_test, y_test) > 0.85


@pytest.mark.skipif(sklearn.__version__ >= "1.0",
reason="discrepancies on coefficients with sklearn 1.2")
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("params", [{"precompute": True},
{"precompute": False},
@@ -149,7 +154,7 @@
{"n_nonzero_coefs": 2,
"fit_intercept": False}])
def test_lars_attributes(datatype, params):
X, y = load_boston(return_X_y=True)
X, y = fetch_california_housing(return_X_y=True)
X = X.astype(datatype)
y = y.astype(datatype)

@@ -191,7 +196,7 @@ def test_lars_attributes(datatype, params):

@pytest.mark.parametrize("datatype", [np.float32, np.float64])
def test_lars_copy_X(datatype):
X, y = load_boston(return_X_y=True)
X, y = fetch_california_housing(return_X_y=True)
X = cp.asarray(X, dtype=datatype, order='F')
y = cp.asarray(y, dtype=datatype, order='F')

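The LARS tests swap `load_boston`, removed in scikit-learn 1.2, for `fetch_california_housing`. Unlike the old loader, the replacement fetches the data over the network on first use and caches it locally; a one-line sketch:

```python
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)  # downloads on first call, then cached
```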