From ea1452d57d389acce5fe31aae10098c8d1c6e67a Mon Sep 17 00:00:00 2001
From: Nick Becker <nickb500@gmail.com>
Date: Fri, 17 Feb 2023 13:01:16 -0500
Subject: [PATCH 1/4] add hdbscan, linearsvc, and linearsvr; update RF
 arguments

---
 python/cuml/benchmark/algorithms.py | 40 ++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 42bd8b72ec..1f8eff81c9 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -59,7 +59,7 @@
     decomposition,
     linear_model,
 )  # noqa: F401
-from cuml.internals.import_utils import has_umap
+from cuml.internals.import_utils import has_umap, has_hdbscan_prediction
 from cuml.internals.safe_imports import cpu_only_import
 
 np = cpu_only_import("numpy")
@@ -69,6 +69,10 @@
     import umap
 
 
+if has_hdbscan_prediction(raise_if_unavailable=False):
+    import hdbscan
+
+
 class AlgorithmPair:
     """
     Wraps a cuML algorithm and (optionally) a cpu-based algorithm
@@ -272,6 +276,16 @@ def all_algorithms():
             name="DBSCAN",
             accepts_labels=False,
         ),
+        AlgorithmPair(
+            hdbscan.HDBSCAN
+            if has_hdbscan_prediction(raise_if_unavailable=True)
+            else None,
+            cuml.cluster.HDBSCAN,
+            shared_args={},
+            cpu_args={},
+            name="HDBSCAN",
+            accepts_labels=False,
+        ),
         AlgorithmPair(
             sklearn.linear_model.LinearRegression,
             cuml.linear_model.LinearRegression,
@@ -315,7 +329,8 @@ def all_algorithms():
         AlgorithmPair(
             sklearn.ensemble.RandomForestClassifier,
             cuml.ensemble.RandomForestClassifier,
-            shared_args={"max_features": 1.0, "n_estimators": 10},
+            shared_args={"max_features": "sqrt", "n_estimators": 50},
+            cpu_args={"n_jobs": 1},
             name="RandomForestClassifier",
             accepts_labels=True,
             cpu_data_prep_hook=_labels_to_int_hook,
@@ -325,7 +340,8 @@ def all_algorithms():
         AlgorithmPair(
             sklearn.ensemble.RandomForestRegressor,
             cuml.ensemble.RandomForestRegressor,
-            shared_args={"max_features": 1.0, "n_estimators": 10},
+            shared_args={"max_features": 1.0, "n_estimators": 50},
+            cpu_args={"n_jobs": 1},
             name="RandomForestRegressor",
             accepts_labels=True,
             accuracy_function=metrics.r2_score,
@@ -382,6 +398,24 @@ def all_algorithms():
             accepts_labels=True,
             accuracy_function=cuml.metrics.r2_score,
         ),
+        AlgorithmPair(
+            sklearn.svm.LinearSVC,
+            cuml.svm.LinearSVC,
+            shared_args={},
+            cuml_args={},
+            name="LinearSVC",
+            accepts_labels=True,
+            accuracy_function=cuml.metrics.accuracy_score,
+        ),
+        AlgorithmPair(
+            sklearn.svm.LinearSVR,
+            cuml.svm.LinearSVR,
+            shared_args={},
+            cuml_args={},
+            name="LinearSVR",
+            accepts_labels=True,
+            accuracy_function=cuml.metrics.r2_score,
+        ),
         AlgorithmPair(
             sklearn.neighbors.KNeighborsClassifier,
             cuml.neighbors.KNeighborsClassifier,

From abc52f9c4d508627b45f52c56d012d72923d73a7 Mon Sep 17 00:00:00 2001
From: Nick Becker <nickb500@gmail.com>
Date: Fri, 17 Feb 2023 13:10:22 -0500
Subject: [PATCH 2/4] n_jobs typo

---
 python/cuml/benchmark/algorithms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 1f8eff81c9..59ab2479cd 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -330,7 +330,7 @@ def all_algorithms():
             sklearn.ensemble.RandomForestClassifier,
             cuml.ensemble.RandomForestClassifier,
             shared_args={"max_features": "sqrt", "n_estimators": 50},
-            cpu_args={"n_jobs": 1},
+            cpu_args={"n_jobs": -1},
             name="RandomForestClassifier",
             accepts_labels=True,
             cpu_data_prep_hook=_labels_to_int_hook,
@@ -341,7 +341,7 @@ def all_algorithms():
             sklearn.ensemble.RandomForestRegressor,
             cuml.ensemble.RandomForestRegressor,
             shared_args={"max_features": 1.0, "n_estimators": 50},
-            cpu_args={"n_jobs": 1},
+            cpu_args={"n_jobs": -1},
             name="RandomForestRegressor",
             accepts_labels=True,
             accuracy_function=metrics.r2_score,

From b5c030b912e8a0af766dd429a67e6a504e94511e Mon Sep 17 00:00:00 2001
From: Nick Becker <nickb500@gmail.com>
Date: Fri, 17 Feb 2023 13:11:23 -0500
Subject: [PATCH 3/4] don't raise if unavailable

---
 python/cuml/benchmark/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 59ab2479cd..039c18a7ad 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -278,7 +278,7 @@ def all_algorithms():
         ),
         AlgorithmPair(
             hdbscan.HDBSCAN
-            if has_hdbscan_prediction(raise_if_unavailable=True)
+            if has_hdbscan_prediction(raise_if_unavailable=False)
             else None,
             cuml.cluster.HDBSCAN,
             shared_args={},

From dd4f3b1cf83e0a10a33eb56ca373a32598b4a998 Mon Sep 17 00:00:00 2001
From: Nick Becker <nickb500@gmail.com>
Date: Tue, 21 Feb 2023 17:28:02 -0500
Subject: [PATCH 4/4] hdbscan import utils refactoring. just require the full,
 standard hdbscan package

---
 python/cuml/benchmark/algorithms.py        |  8 +++-----
 python/cuml/cluster/hdbscan/hdbscan.pyx    | 11 +++++------
 python/cuml/cluster/hdbscan/prediction.pyx |  7 +++----
 python/cuml/internals/import_utils.py      | 22 +++++-----------------
 4 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 039c18a7ad..a333a2beb7 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -59,7 +59,7 @@
     decomposition,
     linear_model,
 )  # noqa: F401
-from cuml.internals.import_utils import has_umap, has_hdbscan_prediction
+from cuml.internals.import_utils import has_hdbscan, has_umap
 from cuml.internals.safe_imports import cpu_only_import
 
 np = cpu_only_import("numpy")
@@ -69,7 +69,7 @@
     import umap
 
 
-if has_hdbscan_prediction(raise_if_unavailable=False):
+if has_hdbscan():
     import hdbscan
 
 
@@ -277,9 +277,7 @@ def all_algorithms():
             accepts_labels=False,
         ),
         AlgorithmPair(
-            hdbscan.HDBSCAN
-            if has_hdbscan_prediction(raise_if_unavailable=False)
-            else None,
+            hdbscan.HDBSCAN if has_hdbscan() else None,
             cuml.cluster.HDBSCAN,
             shared_args={},
             cpu_args={},
diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx
index 1bf4386b63..4bfecb3136 100644
--- a/python/cuml/cluster/hdbscan/hdbscan.pyx
+++ b/python/cuml/cluster/hdbscan/hdbscan.pyx
@@ -41,8 +41,7 @@ from cuml.internals.api_decorators import enable_device_interop
 from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
 from cuml.internals import logger
-from cuml.internals.import_utils import has_hdbscan_plots
-from cuml.internals.import_utils import has_hdbscan_prediction
+from cuml.internals.import_utils import has_hdbscan
 
 import cuml
 from cuml.metrics.distance_type cimport DistanceType
@@ -210,7 +209,7 @@ def _build_condensed_tree_plot_host(
     raw_tree['lambda_val'] = lambdas
     raw_tree['child_size'] = sizes
 
-    if has_hdbscan_plots():
+    if has_hdbscan(raise_if_unavailable=True):
         from hdbscan.plots import CondensedTree
         return CondensedTree(raw_tree,
                              cluster_selection_method,
@@ -586,7 +585,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
 
             raw_tree = raw_tree.astype(np.float64)
 
-            if has_hdbscan_plots():
+            if has_hdbscan(raise_if_unavailable=True):
                 from hdbscan.plots import SingleLinkageTree
                 self.single_linkage_tree_obj = SingleLinkageTree(raw_tree)
 
@@ -605,7 +604,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
                'model.generate_prediction_data()')
 
         if self.prediction_data_obj is None:
-            if has_hdbscan_prediction():
+            if has_hdbscan(raise_if_unavailable=True):
                 from sklearn.neighbors import KDTree, BallTree
                 from hdbscan.prediction import PredictionData
 
@@ -646,7 +645,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
 
             raw_tree = raw_tree.astype(np.float64)
 
-            if has_hdbscan_plots():
+            if has_hdbscan(raise_if_unavailable=True):
                 from hdbscan.plots import MinimumSpanningTree
                 self.minimum_spanning_tree_ = \
                     MinimumSpanningTree(raw_tree, X.to_output("numpy"))
diff --git a/python/cuml/cluster/hdbscan/prediction.pyx b/python/cuml/cluster/hdbscan/prediction.pyx
index 63f2dadb45..2de567c6f1 100644
--- a/python/cuml/cluster/hdbscan/prediction.pyx
+++ b/python/cuml/cluster/hdbscan/prediction.pyx
@@ -40,8 +40,7 @@ from cuml.internals.device_type import DeviceType
 from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
 from cuml.internals import logger
-from cuml.internals.import_utils import has_hdbscan_plots
-from cuml.internals.import_utils import has_hdbscan_prediction
+from cuml.internals.import_utils import has_hdbscan
 
 import cuml
 from cuml.metrics.distance_type cimport DistanceType
@@ -144,7 +143,7 @@ def all_points_membership_vectors(clusterer):
 
     # cpu infer, cpu/gpu train
     if device_type == DeviceType.host:
-        assert has_hdbscan_prediction()
+        has_hdbscan(raise_if_unavailable=True)
         from hdbscan.prediction import all_points_membership_vectors \
             as cpu_all_points_membership_vectors
 
@@ -247,7 +246,7 @@ def approximate_predict(clusterer, points_to_predict, convert_dtype=True):
 
     # cpu infer, cpu/gpu train
     if device_type == DeviceType.host:
-        assert has_hdbscan_prediction()
+        has_hdbscan(raise_if_unavailable=True)
         from hdbscan.prediction import approximate_predict \
             as cpu_approximate_predict
 
diff --git a/python/cuml/internals/import_utils.py b/python/cuml/internals/import_utils.py
index 35d793c8ab..2e4165e3bb 100644
--- a/python/cuml/internals/import_utils.py
+++ b/python/cuml/internals/import_utils.py
@@ -162,30 +162,18 @@ def has_sklearn():
         return False
 
 
-def has_hdbscan_plots(raise_if_unavailable=True):
+def has_hdbscan(raise_if_unavailable=False):
     try:
-        from hdbscan.plots import SingleLinkageTree  # NOQA
+        import hdbscan  # NOQA
 
         return True
     except ImportError:
-        if raise_if_unavailable:
-            raise ImportError("hdbscan must be installed to use plots.")
-        else:
+        if not raise_if_unavailable:
             return False
-
-
-def has_hdbscan_prediction(raise_if_unavailable=True):
-    try:
-        from hdbscan.prediction import PredictionData  # NOQA
-
-        return True
-    except ImportError:
-        if raise_if_unavailable:
+        else:
             raise ImportError(
-                "hdbscan.prediction must be installed " "to use prediction."
+                "hdbscan is not available. Please install hdbscan."
             )
-        else:
-            return False
 
 
 def has_shap(min_version="0.37"):