From ea1452d57d389acce5fe31aae10098c8d1c6e67a Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 17 Feb 2023 13:01:16 -0500 Subject: [PATCH 1/4] add hdbscan, linearsvc, and linearsvr; update RF arguments --- python/cuml/benchmark/algorithms.py | 40 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 42bd8b72ec..1f8eff81c9 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -59,7 +59,7 @@ decomposition, linear_model, ) # noqa: F401 -from cuml.internals.import_utils import has_umap +from cuml.internals.import_utils import has_umap, has_hdbscan_prediction from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import("numpy") @@ -69,6 +69,10 @@ import umap +if has_hdbscan_prediction(raise_if_unavailable=False): + import hdbscan + + class AlgorithmPair: """ Wraps a cuML algorithm and (optionally) a cpu-based algorithm @@ -272,6 +276,16 @@ def all_algorithms(): name="DBSCAN", accepts_labels=False, ), + AlgorithmPair( + hdbscan.HDBSCAN + if has_hdbscan_prediction(raise_if_unavailable=True) + else None, + cuml.cluster.HDBSCAN, + shared_args={}, + cpu_args={}, + name="HDBSCAN", + accepts_labels=False, + ), AlgorithmPair( sklearn.linear_model.LinearRegression, cuml.linear_model.LinearRegression, @@ -315,7 +329,8 @@ def all_algorithms(): AlgorithmPair( sklearn.ensemble.RandomForestClassifier, cuml.ensemble.RandomForestClassifier, - shared_args={"max_features": 1.0, "n_estimators": 10}, + shared_args={"max_features": "sqrt", "n_estimators": 50}, + cpu_args={"n_jobs": 1}, name="RandomForestClassifier", accepts_labels=True, cpu_data_prep_hook=_labels_to_int_hook, @@ -325,7 +340,8 @@ def all_algorithms(): AlgorithmPair( sklearn.ensemble.RandomForestRegressor, cuml.ensemble.RandomForestRegressor, - shared_args={"max_features": 1.0, "n_estimators": 10}, + shared_args={"max_features": 1.0, "n_estimators": 
50}, + cpu_args={"n_jobs": 1}, name="RandomForestRegressor", accepts_labels=True, accuracy_function=metrics.r2_score, @@ -382,6 +398,24 @@ def all_algorithms(): accepts_labels=True, accuracy_function=cuml.metrics.r2_score, ), + AlgorithmPair( + sklearn.svm.LinearSVC, + cuml.svm.LinearSVC, + shared_args={}, + cuml_args={}, + name="LinearSVC", + accepts_labels=True, + accuracy_function=cuml.metrics.accuracy_score, + ), + AlgorithmPair( + sklearn.svm.LinearSVR, + cuml.svm.LinearSVR, + shared_args={}, + cuml_args={}, + name="LinearSVR", + accepts_labels=True, + accuracy_function=cuml.metrics.r2_score, + ), AlgorithmPair( sklearn.neighbors.KNeighborsClassifier, cuml.neighbors.KNeighborsClassifier, From abc52f9c4d508627b45f52c56d012d72923d73a7 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 17 Feb 2023 13:10:22 -0500 Subject: [PATCH 2/4] n_jobs typo --- python/cuml/benchmark/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 1f8eff81c9..59ab2479cd 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -330,7 +330,7 @@ def all_algorithms(): sklearn.ensemble.RandomForestClassifier, cuml.ensemble.RandomForestClassifier, shared_args={"max_features": "sqrt", "n_estimators": 50}, - cpu_args={"n_jobs": 1}, + cpu_args={"n_jobs": -1}, name="RandomForestClassifier", accepts_labels=True, cpu_data_prep_hook=_labels_to_int_hook, @@ -341,7 +341,7 @@ def all_algorithms(): sklearn.ensemble.RandomForestRegressor, cuml.ensemble.RandomForestRegressor, shared_args={"max_features": 1.0, "n_estimators": 50}, - cpu_args={"n_jobs": 1}, + cpu_args={"n_jobs": -1}, name="RandomForestRegressor", accepts_labels=True, accuracy_function=metrics.r2_score, From b5c030b912e8a0af766dd429a67e6a504e94511e Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 17 Feb 2023 13:11:23 -0500 Subject: [PATCH 3/4] dont raise if unavailable ---
python/cuml/benchmark/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 59ab2479cd..039c18a7ad 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -278,7 +278,7 @@ def all_algorithms(): ), AlgorithmPair( hdbscan.HDBSCAN - if has_hdbscan_prediction(raise_if_unavailable=True) + if has_hdbscan_prediction(raise_if_unavailable=False) else None, cuml.cluster.HDBSCAN, shared_args={}, From dd4f3b1cf83e0a10a33eb56ca373a32598b4a998 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Tue, 21 Feb 2023 17:28:02 -0500 Subject: [PATCH 4/4] hdbscan import utils refactoring. just require the full, standard hdbscan package --- python/cuml/benchmark/algorithms.py | 8 +++----- python/cuml/cluster/hdbscan/hdbscan.pyx | 11 +++++------ python/cuml/cluster/hdbscan/prediction.pyx | 7 +++---- python/cuml/internals/import_utils.py | 22 +++++----------------- 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 039c18a7ad..a333a2beb7 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -59,7 +59,7 @@ decomposition, linear_model, ) # noqa: F401 -from cuml.internals.import_utils import has_umap, has_hdbscan_prediction +from cuml.internals.import_utils import has_hdbscan, has_umap from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import("numpy") @@ -69,7 +69,7 @@ import umap -if has_hdbscan_prediction(raise_if_unavailable=False): +if has_hdbscan(): import hdbscan @@ -277,9 +277,7 @@ def all_algorithms(): accepts_labels=False, ), AlgorithmPair( - hdbscan.HDBSCAN - if has_hdbscan_prediction(raise_if_unavailable=False) - else None, + hdbscan.HDBSCAN if has_hdbscan() else None, cuml.cluster.HDBSCAN, shared_args={}, cpu_args={}, diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx 
b/python/cuml/cluster/hdbscan/hdbscan.pyx index 1bf4386b63..4bfecb3136 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -41,8 +41,7 @@ from cuml.internals.api_decorators import enable_device_interop from cuml.internals.mixins import ClusterMixin from cuml.internals.mixins import CMajorInputTagMixin from cuml.internals import logger -from cuml.internals.import_utils import has_hdbscan_plots -from cuml.internals.import_utils import has_hdbscan_prediction +from cuml.internals.import_utils import has_hdbscan import cuml from cuml.metrics.distance_type cimport DistanceType @@ -210,7 +209,7 @@ def _build_condensed_tree_plot_host( raw_tree['lambda_val'] = lambdas raw_tree['child_size'] = sizes - if has_hdbscan_plots(): + if has_hdbscan(raise_if_unavailable=True): from hdbscan.plots import CondensedTree return CondensedTree(raw_tree, cluster_selection_method, @@ -586,7 +585,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): raw_tree = raw_tree.astype(np.float64) - if has_hdbscan_plots(): + if has_hdbscan(raise_if_unavailable=True): from hdbscan.plots import SingleLinkageTree self.single_linkage_tree_obj = SingleLinkageTree(raw_tree) @@ -605,7 +604,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): 'model.generate_prediction_data()') if self.prediction_data_obj is None: - if has_hdbscan_prediction(): + if has_hdbscan(raise_if_unavailable=True): from sklearn.neighbors import KDTree, BallTree from hdbscan.prediction import PredictionData @@ -646,7 +645,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): raw_tree = raw_tree.astype(np.float64) - if has_hdbscan_plots(): + if has_hdbscan(raise_if_unavailable=True): from hdbscan.plots import MinimumSpanningTree self.minimum_spanning_tree_ = \ MinimumSpanningTree(raw_tree, X.to_output("numpy")) diff --git a/python/cuml/cluster/hdbscan/prediction.pyx b/python/cuml/cluster/hdbscan/prediction.pyx index 63f2dadb45..2de567c6f1 
100644 --- a/python/cuml/cluster/hdbscan/prediction.pyx +++ b/python/cuml/cluster/hdbscan/prediction.pyx @@ -40,8 +40,7 @@ from cuml.internals.device_type import DeviceType from cuml.internals.mixins import ClusterMixin from cuml.internals.mixins import CMajorInputTagMixin from cuml.internals import logger -from cuml.internals.import_utils import has_hdbscan_plots -from cuml.internals.import_utils import has_hdbscan_prediction +from cuml.internals.import_utils import has_hdbscan import cuml from cuml.metrics.distance_type cimport DistanceType @@ -144,7 +143,7 @@ def all_points_membership_vectors(clusterer): # cpu infer, cpu/gpu train if device_type == DeviceType.host: - assert has_hdbscan_prediction() + assert has_hdbscan(raise_if_unavailable=True) from hdbscan.prediction import all_points_membership_vectors \ as cpu_all_points_membership_vectors @@ -247,7 +246,7 @@ def approximate_predict(clusterer, points_to_predict, convert_dtype=True): # cpu infer, cpu/gpu train if device_type == DeviceType.host: - assert has_hdbscan_prediction() + assert has_hdbscan(raise_if_unavailable=True) from hdbscan.prediction import approximate_predict \ as cpu_approximate_predict diff --git a/python/cuml/internals/import_utils.py b/python/cuml/internals/import_utils.py index 35d793c8ab..2e4165e3bb 100644 --- a/python/cuml/internals/import_utils.py +++ b/python/cuml/internals/import_utils.py @@ -162,30 +162,18 @@ def has_sklearn(): return False -def has_hdbscan_plots(raise_if_unavailable=True): +def has_hdbscan(raise_if_unavailable=False): try: - from hdbscan.plots import SingleLinkageTree # NOQA + import hdbscan # NOQA return True except ImportError: - if raise_if_unavailable: - raise ImportError("hdbscan must be installed to use plots.") - else: + if not raise_if_unavailable: return False - - -def has_hdbscan_prediction(raise_if_unavailable=True): - try: - from hdbscan.prediction import PredictionData # NOQA - - return True - except ImportError: - if raise_if_unavailable: + else: 
raise ImportError( - "hdbscan.prediction must be installed " "to use prediction." + "hdbscan is not available. Please install hdbscan." ) - else: - return False def has_shap(min_version="0.37"):