Enhance cuML benchmark utility and refactor hdbscan import utilities #5242

Merged: 11 commits, Mar 6, 2023
python/cuml/benchmark/algorithms.py: 38 changes (35 additions & 3 deletions)
@@ -59,7 +59,7 @@
decomposition,
linear_model,
) # noqa: F401
-from cuml.internals.import_utils import has_umap
+from cuml.internals.import_utils import has_hdbscan, has_umap
from cuml.internals.safe_imports import cpu_only_import

np = cpu_only_import("numpy")
@@ -69,6 +69,10 @@
import umap


+if has_hdbscan():
+    import hdbscan


class AlgorithmPair:
"""
Wraps a cuML algorithm and (optionally) a cpu-based algorithm
@@ -272,6 +276,14 @@ def all_algorithms():
name="DBSCAN",
accepts_labels=False,
),
+AlgorithmPair(
+hdbscan.HDBSCAN if has_hdbscan() else None,
+cuml.cluster.HDBSCAN,
+shared_args={},
+cpu_args={},
+name="HDBSCAN",
+accepts_labels=False,
+),
AlgorithmPair(
sklearn.linear_model.LinearRegression,
cuml.linear_model.LinearRegression,
Expand Down Expand Up @@ -315,7 +327,8 @@ def all_algorithms():
AlgorithmPair(
sklearn.ensemble.RandomForestClassifier,
cuml.ensemble.RandomForestClassifier,
shared_args={"max_features": 1.0, "n_estimators": 10},
shared_args={"max_features": "sqrt", "n_estimators": 50},
cpu_args={"n_jobs": -1},
name="RandomForestClassifier",
accepts_labels=True,
cpu_data_prep_hook=_labels_to_int_hook,
@@ -325,7 +338,8 @@
AlgorithmPair(
sklearn.ensemble.RandomForestRegressor,
cuml.ensemble.RandomForestRegressor,
shared_args={"max_features": 1.0, "n_estimators": 10},
shared_args={"max_features": 1.0, "n_estimators": 50},
cpu_args={"n_jobs": -1},
name="RandomForestRegressor",
accepts_labels=True,
accuracy_function=metrics.r2_score,
@@ -382,6 +396,24 @@ def all_algorithms():
accepts_labels=True,
accuracy_function=cuml.metrics.r2_score,
),
+AlgorithmPair(
+sklearn.svm.LinearSVC,
+cuml.svm.LinearSVC,
+shared_args={},
+cuml_args={},
+name="LinearSVC",
+accepts_labels=True,
+accuracy_function=cuml.metrics.accuracy_score,
+),
+AlgorithmPair(
+sklearn.svm.LinearSVR,
+cuml.svm.LinearSVR,
+shared_args={},
+cuml_args={},
+name="LinearSVR",
+accepts_labels=True,
+accuracy_function=cuml.metrics.accuracy_score,
+),
AlgorithmPair(
sklearn.neighbors.KNeighborsClassifier,
cuml.neighbors.KNeighborsClassifier,
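
The new HDBSCAN pair follows the same optional-dependency pattern already used for UMAP: the CPU library is imported only when has_hdbscan() reports it is installed, and None is registered as the CPU implementation otherwise so the GPU side can still be benchmarked. A minimal standalone sketch of that pattern (this is an illustration, not the benchmark runner itself; it assumes the usual scikit-learn-style fit_predict interface on both estimators):

import numpy as np
import cuml

# Optional CPU reference implementation: fall back to None when absent,
# mirroring "hdbscan.HDBSCAN if has_hdbscan() else None" in the diff above.
try:
    import hdbscan
except ImportError:
    hdbscan = None

X = np.random.RandomState(0).random((2000, 16)).astype(np.float32)

# GPU path is always available once cuML is installed.
gpu_labels = cuml.cluster.HDBSCAN().fit_predict(X)

# CPU path only runs when the optional dependency is present.
if hdbscan is not None:
    cpu_labels = hdbscan.HDBSCAN().fit_predict(X)
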
python/cuml/cluster/hdbscan/hdbscan.pyx: 11 changes (5 additions & 6 deletions)
@@ -41,8 +41,7 @@ from cuml.internals.api_decorators import enable_device_interop
from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.internals import logger
-from cuml.internals.import_utils import has_hdbscan_plots
-from cuml.internals.import_utils import has_hdbscan_prediction
+from cuml.internals.import_utils import has_hdbscan

import cuml
from cuml.metrics.distance_type cimport DistanceType
@@ -210,7 +209,7 @@ def _build_condensed_tree_plot_host(
raw_tree['lambda_val'] = lambdas
raw_tree['child_size'] = sizes

-if has_hdbscan_plots():
+if has_hdbscan(raise_if_unavailable=True):
from hdbscan.plots import CondensedTree
return CondensedTree(raw_tree,
cluster_selection_method,
@@ -586,7 +585,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):

raw_tree = raw_tree.astype(np.float64)

-if has_hdbscan_plots():
+if has_hdbscan(raise_if_unavailable=True):
from hdbscan.plots import SingleLinkageTree
self.single_linkage_tree_obj = SingleLinkageTree(raw_tree)

@@ -605,7 +604,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
'model.generate_prediction_data()')

if self.prediction_data_obj is None:
-if has_hdbscan_prediction():
+if has_hdbscan(raise_if_unavailable=True):
from sklearn.neighbors import KDTree, BallTree
from hdbscan.prediction import PredictionData

@@ -646,7 +645,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):

raw_tree = raw_tree.astype(np.float64)

-if has_hdbscan_plots():
+if has_hdbscan(raise_if_unavailable=True):
from hdbscan.plots import MinimumSpanningTree
self.minimum_spanning_tree_ = \
MinimumSpanningTree(raw_tree, X.to_output("numpy"))
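
Each of the call sites above now uses the single has_hdbscan(raise_if_unavailable=True) check before importing the specific hdbscan submodule it needs. A condensed sketch of that guard (the wrapper function is hypothetical; only the check-then-import shape comes from the diff):

from cuml.internals.import_utils import has_hdbscan


def build_single_linkage_tree_obj(raw_tree):
    # Raises a descriptive ImportError when hdbscan is not installed,
    # so the submodule import below can be assumed to succeed.
    if has_hdbscan(raise_if_unavailable=True):
        from hdbscan.plots import SingleLinkageTree
        return SingleLinkageTree(raw_tree)
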
python/cuml/cluster/hdbscan/prediction.pyx: 7 changes (3 additions & 4 deletions)
@@ -40,8 +40,7 @@ from cuml.internals.device_type import DeviceType
from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.internals import logger
-from cuml.internals.import_utils import has_hdbscan_plots
-from cuml.internals.import_utils import has_hdbscan_prediction
+from cuml.internals.import_utils import has_hdbscan

import cuml
from cuml.metrics.distance_type cimport DistanceType
@@ -144,7 +143,7 @@ def all_points_membership_vectors(clusterer):

# cpu infer, cpu/gpu train
if device_type == DeviceType.host:
-assert has_hdbscan_prediction()
+assert has_hdbscan(raise_if_unavailable=True)
from hdbscan.prediction import all_points_membership_vectors \
as cpu_all_points_membership_vectors

@@ -247,7 +246,7 @@ def approximate_predict(clusterer, points_to_predict, convert_dtype=True):

# cpu infer, cpu/gpu train
if device_type == DeviceType.host:
-assert has_hdbscan_prediction()
+assert has_hdbscan(raise_if_unavailable=True)
from hdbscan.prediction import approximate_predict \
as cpu_approximate_predict

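
Both prediction entry points share the same host-side fallback: when inference is requested on the CPU, assert that hdbscan is importable and delegate to its reference implementation. A condensed sketch of that branch (the wrapper function name is hypothetical; the import and the assert come from the diff):

from cuml.internals.import_utils import has_hdbscan


def host_approximate_predict(clusterer, points_to_predict):
    # has_hdbscan(raise_if_unavailable=True) raises a descriptive ImportError
    # when the package is missing, so the assert never fails silently.
    assert has_hdbscan(raise_if_unavailable=True)
    from hdbscan.prediction import approximate_predict as cpu_approximate_predict
    return cpu_approximate_predict(clusterer, points_to_predict)
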
python/cuml/internals/import_utils.py: 22 changes (5 additions & 17 deletions)
@@ -162,30 +162,18 @@ def has_sklearn():
        return False


-def has_hdbscan_plots(raise_if_unavailable=True):
+def has_hdbscan(raise_if_unavailable=False):
    try:
-        from hdbscan.plots import SingleLinkageTree  # NOQA
+        import hdbscan  # NOQA

        return True
    except ImportError:
-        if raise_if_unavailable:
-            raise ImportError("hdbscan must be installed to use plots.")
-        else:
+        if not raise_if_unavailable:
            return False
-
-
-def has_hdbscan_prediction(raise_if_unavailable=True):
-    try:
-        from hdbscan.prediction import PredictionData  # NOQA
-
-        return True
-    except ImportError:
-        if raise_if_unavailable:
+        else:
            raise ImportError(
-                "hdbscan.prediction must be installed " "to use prediction."
+                "hdbscan is not available. Please install hdbscan."
            )
-        else:
-            return False


def has_shap(min_version="0.37"):
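
Read end to end, the hunk above collapses has_hdbscan_plots and has_hdbscan_prediction into one helper that checks for the top-level package instead of individual submodules. For reference, the resulting function is approximately:

def has_hdbscan(raise_if_unavailable=False):
    # Single helper replacing has_hdbscan_plots / has_hdbscan_prediction.
    # With raise_if_unavailable=True, a descriptive ImportError is raised
    # instead of returning False, which is how the .pyx call sites use it.
    try:
        import hdbscan  # NOQA

        return True
    except ImportError:
        if not raise_if_unavailable:
            return False
        else:
            raise ImportError(
                "hdbscan is not available. Please install hdbscan."
            )
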