diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index ae477e099c..0168de32d9 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ import cuml.metrics import cuml.decomposition import cuml.naive_bayes +from cuml.dask import neighbors, cluster, manifold, \ + decomposition, linear_model # noqa: F401 from cuml.common.import_utils import has_umap import numpy as np import tempfile @@ -37,14 +39,17 @@ from cuml.benchmark.bench_helper_funcs import ( fit, - fit_kneighbors, - fit_transform, + transform, predict, + fit_transform, + fit_predict, + fit_kneighbors, _build_cpu_skl_classifier, _build_fil_skl_classifier, _build_fil_classifier, _build_treelite_classifier, _treelite_fil_accuracy_score, + _build_mnmg_umap ) import treelite import treelite_runtime @@ -122,13 +127,13 @@ def __init__( def __str__(self): return "AlgoPair:%s" % (self.name) - def run_cpu(self, data, **override_args): + def run_cpu(self, data, bench_args={}, **override_setup_args): """Runs the cpu-based algorithm's fit method on specified data""" if self.cpu_class is None: raise ValueError("No CPU implementation for %s" % self.name) all_args = {**self.shared_args, **self.cpu_args} - all_args = {**all_args, **override_args} + all_args = {**all_args, **override_setup_args} if "cpu_setup_result" not in all_args: cpu_obj = self.cpu_class(**all_args) @@ -137,16 +142,16 @@ def run_cpu(self, data, **override_args): if self.cpu_data_prep_hook: data = self.cpu_data_prep_hook(data) if self.accepts_labels: - self.bench_func(cpu_obj, data[0], data[1]) + self.bench_func(cpu_obj, data[0], data[1], **bench_args) else: - self.bench_func(cpu_obj, data[0]) + self.bench_func(cpu_obj, data[0], **bench_args) return cpu_obj - def run_cuml(self, data, **override_args): + def run_cuml(self, data, bench_args={}, **override_setup_args): """Runs the cuml-based algorithm's fit method on specified data""" all_args = {**self.shared_args, **self.cuml_args} - all_args = {**all_args, **override_args} + all_args = {**all_args, **override_setup_args} if "cuml_setup_result" not in all_args: cuml_obj = self.cuml_class(**all_args) @@ -155,35 +160,35 @@ def run_cuml(self, data, **override_args): if self.cuml_data_prep_hook: data = self.cuml_data_prep_hook(data) if self.accepts_labels: - self.bench_func(cuml_obj, data[0], data[1]) + self.bench_func(cuml_obj, data[0], data[1], **bench_args) else: - self.bench_func(cuml_obj, data[0]) + self.bench_func(cuml_obj, data[0], **bench_args) return cuml_obj def setup_cpu(self, data, **override_args): + all_args = {**self.shared_args, **self.cpu_args} + all_args = {**all_args, **override_args} if self.setup_cpu_func is not None: - all_args = {**self.shared_args, **self.cpu_args} - all_args = {**all_args, **override_args} return { "cpu_setup_result": self.setup_cpu_func( self.cpu_class, data, all_args, self.tmpdir ) } else: - return {} + return all_args def setup_cuml(self, data, **override_args): + all_args = {**self.shared_args, **self.cuml_args} + all_args = {**all_args, **override_args} if self.setup_cuml_func is not None: - all_args = {**self.shared_args, **self.cuml_args} - all_args = {**all_args, **override_args} return { "cuml_setup_result": self.setup_cuml_func( self.cuml_class, data, 
all_args, self.tmpdir ) } else: - return {} + return all_args def _labels_to_int_hook(data): @@ -228,7 +233,6 @@ def all_algorithms(): cuml.random_projection.GaussianRandomProjection, shared_args=dict(n_components=10), name="GaussianRandomProjection", - bench_func=fit_transform, accepts_labels=False, ), AlgorithmPair( @@ -236,7 +240,6 @@ def all_algorithms(): cuml.random_projection.SparseRandomProjection, shared_args=dict(n_components=10), name="SparseRandomProjection", - bench_func=fit_transform, accepts_labels=False, ), AlgorithmPair( @@ -434,7 +437,7 @@ def all_algorithms(): cuml.manifold.UMAP, shared_args=dict(n_neighbors=5, n_epochs=500), name="UMAP-Unsupervised", - accepts_labels=True, + accepts_labels=False, accuracy_function=cuml.metrics.trustworthiness, ), AlgorithmPair( @@ -556,6 +559,139 @@ def all_algorithms(): name="SparseCSRPolynomialFeatures", accepts_labels=False, bench_func=fit_transform + ), + + AlgorithmPair( + None, + cuml.dask.neighbors.KNeighborsClassifier, + shared_args={}, + cuml_args={}, + name="MNMG.KNeighborsClassifier", + bench_func=fit_predict, + accepts_labels=True, + accuracy_function=cuml.metrics.accuracy_score + ), + + AlgorithmPair( + None, + cuml.dask.cluster.KMeans, + shared_args=dict(n_clusters=8, max_iter=300, n_init=1), + cpu_args=dict(init="k-means++"), + cuml_args=dict(init="scalable-k-means++"), + name="MNMG.KMeans", + bench_func=fit_predict, + accepts_labels=False, + accuracy_function=metrics.homogeneity_score, + ), + + AlgorithmPair( + None, + cuml.dask.cluster.DBSCAN, + shared_args=dict(eps=3, min_samples=2), + cpu_args=dict(algorithm="brute"), + name="MNMG.DBSCAN", + bench_func=fit_predict, + accepts_labels=False, + ), + + AlgorithmPair( + None, + cuml.dask.manifold.UMAP, + shared_args=dict(n_neighbors=5, n_epochs=500), + name="MNMG.UMAP-Unsupervised", + bench_func=transform, + setup_cuml_func=_build_mnmg_umap, + accepts_labels=False, + accuracy_function=cuml.metrics.trustworthiness, + ), + + AlgorithmPair( + None, + cuml.dask.manifold.UMAP, + shared_args=dict(n_neighbors=5, n_epochs=500), + name="MNMG.UMAP-Supervised", + bench_func=transform, + setup_cuml_func=_build_mnmg_umap, + accepts_labels=True, + accuracy_function=cuml.metrics.trustworthiness, + ), + + AlgorithmPair( + None, + cuml.dask.neighbors.NearestNeighbors, + shared_args=dict(n_neighbors=1024), + cpu_args=dict(algorithm="brute", n_jobs=-1), + cuml_args={}, + name="MNMG.NearestNeighbors", + accepts_labels=False, + bench_func=fit_kneighbors, + ), + + AlgorithmPair( + None, + cuml.dask.decomposition.TruncatedSVD, + shared_args=dict(n_components=10), + name="MNMG.tSVD", + accepts_labels=False, + ), + + AlgorithmPair( + None, + cuml.dask.decomposition.PCA, + shared_args=dict(n_components=10), + name="MNMG.PCA", + accepts_labels=False, + ), + + AlgorithmPair( + None, + cuml.dask.linear_model.LinearRegression, + shared_args={}, + name="MNMG.LinearRegression", + bench_func=fit_predict, + accepts_labels=True, + accuracy_function=metrics.r2_score, + ), + + AlgorithmPair( + None, + cuml.dask.linear_model.Lasso, + shared_args={}, + name="MNMG.Lasso", + bench_func=fit_predict, + accepts_labels=True, + accuracy_function=metrics.r2_score, + ), + + AlgorithmPair( + None, + cuml.dask.linear_model.ElasticNet, + shared_args={"alpha": 0.1, "l1_ratio": 0.5}, + name="MNMG.ElasticNet", + bench_func=fit_predict, + accepts_labels=True, + accuracy_function=metrics.r2_score, + ), + + AlgorithmPair( + None, + cuml.dask.linear_model.Ridge, + shared_args={}, + name="MNMG.Ridge", + bench_func=fit_predict, + 
accepts_labels=True, + accuracy_function=metrics.r2_score, + ), + + AlgorithmPair( + None, + cuml.dask.neighbors.KNeighborsRegressor, + shared_args={}, + cuml_args={}, + name="MNMG.KNeighborsRegressor", + bench_func=fit_predict, + accepts_labels=True, + accuracy_function=cuml.metrics.r2_score ) ] diff --git a/python/cuml/benchmark/automated/__init__.py b/python/cuml/benchmark/automated/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/cuml/benchmark/automated/bench_classification.py b/python/cuml/benchmark/automated/bench_classification.py new file mode 100644 index 0000000000..2d34d01c67 --- /dev/null +++ b/python/cuml/benchmark/automated/bench_classification.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from .utils.utils import _benchmark_algo, fixture_generation_helper +from .utils.utils import bench_step # noqa: F401 +from .. import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def classification(request): + data = datagen.gen_data( + 'classification', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'classification', + **request.param + } + + +def bench_logistic_regression(gpubenchmark, bench_step, # noqa: F811 + classification): + _benchmark_algo(gpubenchmark, 'LogisticRegression', + bench_step, classification) + + +def bench_mbsgcclf(gpubenchmark, bench_step, classification): # noqa: F811 + _benchmark_algo(gpubenchmark, 'MBSGDClassifier', + bench_step, classification) + + +def bench_knnclassifier(gpubenchmark, bench_step, # noqa: F811 + classification): + _benchmark_algo(gpubenchmark, 'KNeighborsClassifier', + bench_step, classification) + + +def bench_svc_linear(gpubenchmark, bench_step, classification): # noqa: F811 + _benchmark_algo(gpubenchmark, 'SVC-Linear', + bench_step, classification) + + +def bench_svc_rbf(gpubenchmark, bench_step, classification): # noqa: F811 + _benchmark_algo(gpubenchmark, 'SVC-RBF', + bench_step, classification) diff --git a/python/cuml/benchmark/automated/bench_dimensionality_reduction.py b/python/cuml/benchmark/automated/bench_dimensionality_reduction.py new file mode 100644 index 0000000000..e7aefedec2 --- /dev/null +++ b/python/cuml/benchmark/automated/bench_dimensionality_reduction.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from .utils.utils import _benchmark_algo, fixture_generation_helper +from .utils.utils import bench_step # noqa: F401 +from .. import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def blobs1(request): + data = datagen.gen_data( + 'blobs', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'blobs', + **request.param + } + + +@pytest.fixture(scope='session') +def blobs2(request): + dataset_kwargs = { + 'dataset_type': 'blobs', + 'n_samples': 10000, + 'n_features': 100 + } + dataset = datagen.gen_data( + dataset_kwargs['dataset_type'], + 'cupy', + n_samples=dataset_kwargs['n_samples'], + n_features=dataset_kwargs['n_features'] + ) + return dataset, dataset_kwargs + + +@pytest.fixture(scope='session') +def blobs3(request): + dataset_kwargs = { + 'dataset_type': 'blobs', + 'n_samples': 50000, + 'n_features': 100 + } + dataset = datagen.gen_data( + dataset_kwargs['dataset_type'], + 'cupy', + n_samples=dataset_kwargs['n_samples'], + n_features=dataset_kwargs['n_features'] + ) + return dataset, dataset_kwargs + + +def bench_kmeans(gpubenchmark, bench_step, blobs1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'KMeans', bench_step, blobs1) + + +@pytest.mark.parametrize('algo_name', ['DBSCAN', + 'UMAP-Unsupervised', + 'UMAP-Supervised', + 'NearestNeighbors', + 'TSNE']) +def bench_with_blobs(gpubenchmark, algo_name, bench_step, # noqa: F811 + blobs2): + # Lump together a bunch of simple blobs-based tests + _benchmark_algo(gpubenchmark, algo_name, bench_step, blobs2) + + +@pytest.mark.parametrize('n_components', [2, 10, 50]) +@pytest.mark.parametrize('algo_name', ['tSVD', + 'PCA']) +def bench_dimensionality_reduction(gpubenchmark, algo_name, + bench_step, blobs3, # noqa: F811 + n_components): + _benchmark_algo(gpubenchmark, algo_name, bench_step, blobs3, + setup_kwargs={'n_components': n_components}) diff --git a/python/cuml/benchmark/automated/bench_preprocessing.py b/python/cuml/benchmark/automated/bench_preprocessing.py new file mode 100644 index 0000000000..7a8400296c --- /dev/null +++ b/python/cuml/benchmark/automated/bench_preprocessing.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from .utils.utils import _benchmark_algo +from .utils.utils import bench_step # noqa: F401 +from .. 
import datagen + +# +# Core tests +# + + +@pytest.fixture(scope='session') +def regression(request): + dataset_kwargs = { + 'dataset_type': 'regression', + 'n_samples': 10000, + 'n_features': 100 + } + dataset = datagen.gen_data( + dataset_kwargs['dataset_type'], + 'cupy', + n_samples=dataset_kwargs['n_samples'], + n_features=dataset_kwargs['n_features'] + ) + return dataset, dataset_kwargs + + +def bench_standardscaler(gpubenchmark, bench_step, regression): # noqa: F811 + _benchmark_algo(gpubenchmark, 'StandardScaler', + bench_step, regression) + + +def bench_maxabsscaler(gpubenchmark, bench_step, regression): # noqa: F811 + _benchmark_algo(gpubenchmark, 'MaxAbsScaler', + bench_step, regression) + + +def bench_normalizer(gpubenchmark, bench_step, regression): # noqa: F811 + _benchmark_algo(gpubenchmark, 'Normalizer', + bench_step, regression) diff --git a/python/cuml/benchmark/automated/bench_random_forest.py b/python/cuml/benchmark/automated/bench_random_forest.py new file mode 100644 index 0000000000..02650c1fae --- /dev/null +++ b/python/cuml/benchmark/automated/bench_random_forest.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from .utils.utils import _benchmark_algo, fixture_generation_helper +from .utils.utils import bench_step # noqa: F401 +from .. import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def classification(request): + data = datagen.gen_data( + 'classification', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'classification', + **request.param + } + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def regression(request): + data = datagen.gen_data( + 'regression', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'regression', + **request.param + } + + +""" +def bench_fil(gpubenchmark, bench_step, classification): + _benchmark_algo(gpubenchmark, 'FIL', + bench_step, classification) +""" + + +def bench_rfc(gpubenchmark, bench_step, classification): # noqa: F811 + _benchmark_algo(gpubenchmark, 'RandomForestClassifier', + bench_step, classification) + + +def bench_rfr(gpubenchmark, bench_step, regression): # noqa: F811 + _benchmark_algo(gpubenchmark, 'RandomForestRegressor', + bench_step, regression) diff --git a/python/cuml/benchmark/automated/bench_regression.py b/python/cuml/benchmark/automated/bench_regression.py new file mode 100644 index 0000000000..0d4ae91a71 --- /dev/null +++ b/python/cuml/benchmark/automated/bench_regression.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from .utils.utils import _benchmark_algo, fixture_generation_helper +from .utils.utils import bench_step # noqa: F401 +from .. import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 400] + })) +def regression1(request): + data = datagen.gen_data( + 'regression', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'regression', + **request.param + } + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [500, 4000], + 'n_features': [5, 400] + })) +def regression2(request): + data = datagen.gen_data( + 'regression', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, { + 'dataset_type': 'regression', + **request.param + } + + +def bench_linear_regression(gpubenchmark, bench_step, # noqa: F811 + regression1): + _benchmark_algo(gpubenchmark, 'LinearRegression', + bench_step, regression1) + + +def bench_lasso(gpubenchmark, bench_step, regression1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'Lasso', + bench_step, regression1) + + +def bench_elastic(gpubenchmark, bench_step, regression1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'ElasticNet', + bench_step, regression1) + + +def bench_ridge(gpubenchmark, bench_step, regression1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'Ridge', + bench_step, regression1) + + +def bench_knnregressor(gpubenchmark, bench_step, regression1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'KNeighborsRegressor', + bench_step, regression1) + + +def bench_svr_rbf(gpubenchmark, bench_step, regression1): # noqa: F811 + _benchmark_algo(gpubenchmark, 'SVR-RBF', + bench_step, regression1) + + +def bench_svr_linear(gpubenchmark, bench_step, regression2): # noqa: F811 + _benchmark_algo(gpubenchmark, 'SVR-Linear', + bench_step, regression2) diff --git a/python/cuml/benchmark/automated/dask/__init__.py b/python/cuml/benchmark/automated/dask/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/cuml/benchmark/automated/dask/bench_mnmg_classification.py b/python/cuml/benchmark/automated/dask/bench_mnmg_classification.py new file mode 100644 index 0000000000..ce1ea2347d --- /dev/null +++ b/python/cuml/benchmark/automated/dask/bench_mnmg_classification.py @@ -0,0 +1,44 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +from ..utils.utils import _benchmark_algo, fixture_generation_helper +from ..utils.utils import bench_step # noqa: F401 +from ... import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def classification(request): + data = datagen.gen_data( + 'classification', + 'cudf', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, None + + +def bench_mnmg_knnclassifier(gpubenchmark, bench_step, # noqa: F811 + classification, client): + _benchmark_algo(gpubenchmark, 'MNMG.KNeighborsClassifier', + bench_step, classification, client=client) diff --git a/python/cuml/benchmark/automated/dask/bench_mnmg_dimensionality_reduction.py b/python/cuml/benchmark/automated/dask/bench_mnmg_dimensionality_reduction.py new file mode 100644 index 0000000000..938afe2937 --- /dev/null +++ b/python/cuml/benchmark/automated/dask/bench_mnmg_dimensionality_reduction.py @@ -0,0 +1,106 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from ..utils.utils import _benchmark_algo, fixture_generation_helper +from ..utils.utils import bench_step # noqa: F401 +from ... 
import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def blobs1(request): + data = datagen.gen_data( + 'classification', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, None + + +@pytest.fixture(scope='session') +def blobs2(request): + dataset_kwargs = { + 'dataset_type': 'blobs', + 'n_samples': 10000, + 'n_features': 100 + } + dataset = datagen.gen_data( + dataset_kwargs['dataset_type'], + 'cupy', + n_samples=dataset_kwargs['n_samples'], + n_features=dataset_kwargs['n_features'] + ) + return dataset, dataset_kwargs + + +@pytest.fixture(scope='session') +def blobs3(request): + dataset_kwargs = { + 'dataset_type': 'blobs', + 'n_samples': 50000, + 'n_features': 100 + } + dataset = datagen.gen_data( + dataset_kwargs['dataset_type'], + 'cupy', + n_samples=dataset_kwargs['n_samples'], + n_features=dataset_kwargs['n_features'] + ) + return dataset, dataset_kwargs + + +def bench_mnmg_kmeans(gpubenchmark, bench_step, blobs1, client): # noqa: F811 + _benchmark_algo(gpubenchmark, 'MNMG.KMeans', + bench_step, blobs1, client=client) + + +def bench_mnmg_dbscan(gpubenchmark, bench_step, blobs2, client): # noqa: F811 + _benchmark_algo(gpubenchmark, 'MNMG.DBSCAN', + bench_step, blobs2, client=client) + + +def bench_mnmg_nearest_neighbors(gpubenchmark, bench_step, # noqa: F811 + blobs2, client): + _benchmark_algo(gpubenchmark, 'MNMG.NearestNeighbors', + bench_step, blobs2, client=client) + + +@pytest.mark.parametrize('algo_name', ['MNMG.UMAP-Unsupervised', + 'MNMG.UMAP-Supervised']) +def bench_mnmg_umap(gpubenchmark, algo_name, bench_step, # noqa: F811 + blobs2, client): + _benchmark_algo(gpubenchmark, algo_name, + bench_step, blobs2, client=client) + + +@pytest.mark.parametrize('algo_name', ['MNMG.tSVD', + 'MNMG.PCA']) +@pytest.mark.parametrize('n_components', [2, 10, 50]) +def bench_mnmg_dimensionality_reduction(gpubenchmark, algo_name, + bench_step, blobs3, # noqa: F811 + client, n_components): + _benchmark_algo(gpubenchmark, algo_name, + bench_step, blobs3, + setup_kwargs={'n_components': n_components}, + client=client) diff --git a/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py b/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py new file mode 100644 index 0000000000..6929b75807 --- /dev/null +++ b/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py @@ -0,0 +1,68 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from ..utils.utils import _benchmark_algo, fixture_generation_helper +from ..utils.utils import bench_step # noqa: F401 +from ... 
import datagen + +# +# Core tests +# + + +@pytest.fixture(**fixture_generation_helper({ + 'n_samples': [1000, 10000], + 'n_features': [5, 500] + })) +def regression(request): + data = datagen.gen_data( + 'regression', + 'cupy', + n_samples=request.param['n_samples'], + n_features=request.param['n_features'] + ) + return data, None + + +def bench_linear_regression(gpubenchmark, bench_step, # noqa: F811 + regression, client): + _benchmark_algo(gpubenchmark, 'MNMG.LinearRegression', + bench_step, regression, client=client) + + +def bench_mnmg_lasso(gpubenchmark, bench_step, # noqa: F811 + regression, client): + _benchmark_algo(gpubenchmark, 'MNMG.Lasso', + bench_step, regression, client=client) + + +def bench_mnmg_elastic(gpubenchmark, bench_step, # noqa: F811 + regression, client): + _benchmark_algo(gpubenchmark, 'MNMG.ElasticNet', + bench_step, regression, client=client) + + +def bench_mnmg_ridge(gpubenchmark, bench_step, # noqa: F811 + regression, client): + _benchmark_algo(gpubenchmark, 'MNMG.Ridge', + bench_step, regression, client=client) + + +def bench_mnmg_knnregressor(gpubenchmark, bench_step, # noqa: F811 + regression, client): + _benchmark_algo(gpubenchmark, 'MNMG.KNeighborsRegressor', + bench_step, regression, client=client) diff --git a/python/cuml/benchmark/automated/dask/conftest.py b/python/cuml/benchmark/automated/dask/conftest.py new file mode 100644 index 0000000000..f38e5045af --- /dev/null +++ b/python/cuml/benchmark/automated/dask/conftest.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest + +from dask_cuda import initialize +from dask_cuda import LocalCUDACluster +from dask.distributed import Client + +enable_tcp_over_ucx = True +enable_nvlink = False +enable_infiniband = False + + +@pytest.fixture(scope="module") +def cluster(): + + cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + yield cluster + cluster.close() + + +@pytest.fixture(scope="function") +def client(cluster): + + client = Client(cluster) + yield client + client.close() + + +@pytest.fixture(scope="module") +def ucx_cluster(): + initialize.initialize(create_cuda_context=True, + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_nvlink=enable_nvlink, + enable_infiniband=enable_infiniband) + cluster = LocalCUDACluster(protocol="ucx", + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_nvlink=enable_nvlink, + enable_infiniband=enable_infiniband) + yield cluster + cluster.close() + + +@pytest.fixture(scope="function") +def ucx_client(ucx_cluster): + + client = Client(ucx_cluster) + yield client + client.close() diff --git a/python/cuml/benchmark/automated/pytest.ini b/python/cuml/benchmark/automated/pytest.ini new file mode 100644 index 0000000000..068fac6e15 --- /dev/null +++ b/python/cuml/benchmark/automated/pytest.ini @@ -0,0 +1,28 @@ +[pytest] +addopts = + --benchmark-warmup=on + --benchmark-warmup-iterations=1 + --benchmark-min-rounds=3 + --benchmark-columns="min, max, mean, stddev, outliers, gpu_mem, rounds" + +markers = + managedmem_on: RMM managed memory enabled + managedmem_off: RMM managed memory disabled + poolallocator_on: RMM pool allocator enabled + poolallocator_off: RMM pool allocator disabled + ETL: benchmarks for ETL steps + small: small datasets + tiny: tiny datasets + ML: benchmarks for ML steps + +python_classes = + Bench* + Test* + +python_files = + bench_* + test_* + +python_functions = + bench_* + test_* diff --git a/python/cuml/benchmark/automated/utils/__init__.py b/python/cuml/benchmark/automated/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/cuml/benchmark/automated/utils/auto_nvtx_bench.py b/python/cuml/benchmark/automated/utils/auto_nvtx_bench.py new file mode 100644 index 0000000000..95d827230f --- /dev/null +++ b/python/cuml/benchmark/automated/utils/auto_nvtx_bench.py @@ -0,0 +1,136 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import json +from cuml.benchmark import datagen, algorithms +from cuml.benchmark.automated.utils.utils import setup_bench + +parser = argparse.ArgumentParser( + prog='launch-benchmark', + description=r''' + Command-line cuML benchmark runner. 
+ + Examples: + python run_benchmarks.py \ + --algo_name LinearRegression \ + --dataset_type regression + ''', + formatter_class=argparse.RawTextHelpFormatter, +) +parser.add_argument( + '--algo_name', + type=str, + default='', + help='Algorithm name', +) +parser.add_argument( + '--dataset_type', + type=str, + default='', + help='Dataset type', +) +parser.add_argument( + '--n_samples', + type=int, + default=10000, + help='Number of samples', +) +parser.add_argument( + '--n_features', + type=int, + default=100, + help='Number of features', +) +parser.add_argument( + '--dataset_format', + type=str, + default='cupy', + help='Dataset format', +) +parser.add_argument( + '--data_kwargs', + type=json.loads, + default={}, + help='Data generation options', +) +parser.add_argument( + '--setup_kwargs', + type=json.loads, + default={}, + help='Algorithm setup options', +) +parser.add_argument( + '--training_kwargs', + type=json.loads, + default={}, + help='Algorithm training options', +) +parser.add_argument( + '--inference_kwargs', + type=json.loads, + default={}, + help='Algorithm inference options', +) +parser.add_argument( + '--json', + type=str, + default='', + help='JSON file containing benchmark parameters', +) +args = parser.parse_args() + + +def parse_json(args): + with open(args.json) as json_file: + params = json.load(json_file) + + # Overwriting + if 'algo_name' in params: + args.algo_name = params['algo_name'] + if 'dataset_type' in params: + args.dataset_type = params['dataset_type'] + if 'n_samples' in params: + args.n_samples = params['n_samples'] + if 'n_features' in params: + args.n_features = params['n_features'] + if 'dataset_format' in params: + args.dataset_format = params['dataset_format'] + if 'data_kwargs' in params: + args.data_kwargs = params['data_kwargs'] + if 'setup_kwargs' in params: + args.setup_kwargs = params['setup_kwargs'] + if 'training_kwargs' in params: + args.training_kwargs = params['training_kwargs'] + if 'inference_kwargs' in params: + args.inference_kwargs = params['inference_kwargs'] + + +if len(args.json): + parse_json(args) + +dataset = datagen.gen_data( + args.dataset_type, + args.dataset_format, + n_samples=args.n_samples, + n_features=args.n_features, + **args.data_kwargs +) + +algo = algorithms.algorithm_by_name(args.algo_name) +cuml_setup = setup_bench('cuml', algo, 'inference', dataset, + args.setup_kwargs, args.training_kwargs) +algo.run_cuml(dataset, bench_args=args.inference_kwargs, **cuml_setup) diff --git a/python/cuml/benchmark/automated/utils/utils.py b/python/cuml/benchmark/automated/utils/utils.py new file mode 100644 index 0000000000..fdbe72b6d4 --- /dev/null +++ b/python/cuml/benchmark/automated/utils/utils.py @@ -0,0 +1,321 @@ +# +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +try: + from rapids_pytest_benchmark import setFixtureParamNames +except ImportError: + print("\n\nWARNING: rapids_pytest_benchmark is not installed, " + "falling back to pytest_benchmark fixtures.\n") + + # if rapids_pytest_benchmark is not available, just perform time-only + # benchmarking and replace the util functions with no-ops + import pytest_benchmark + gpubenchmark = pytest_benchmark.plugin.benchmark + + def setFixtureParamNames(*args, **kwargs): + pass + +import os +import json +import time +import itertools as it +import warnings +import numpy as np +import cupy as cp +import cudf + +import pytest +from cuml.benchmark import datagen, algorithms +from cuml.benchmark.nvtx_benchmark import Profiler +import dask.array as da +import dask.dataframe as df +from copy import copy + +from cuml.benchmark.bench_helper_funcs import pass_func, fit, predict, \ + transform, kneighbors, \ + fit_predict, fit_transform, \ + fit_kneighbors + + +def distribute(client, data): + if data is not None: + n_rows = data.shape[0] + n_workers = len(client.scheduler_info()['workers']) + if isinstance(data, (np.ndarray, cp.ndarray)): + dask_array = da.from_array(x=data, + chunks={0: n_rows // n_workers, 1: -1}) + return dask_array + elif isinstance(data, (cudf.DataFrame, cudf.Series)): + dask_df = df.from_pandas(data, + chunksize=n_rows // n_workers) + return dask_df + else: + raise ValueError('Could not distribute data') + + +def nvtx_profiling(algo_name, data_kwargs, setup_kwargs, + training_kwargs, inference_kwargs): + dataset_type = data_kwargs['dataset_type'] + n_samples = data_kwargs['n_samples'] + n_features = data_kwargs['n_features'] + dataset_format = (data_kwargs['dataset_format'] if 'dataset_format' + in data_kwargs else 'cupy') + + data_kwargs_edited = copy(data_kwargs) + for param in ['dataset_type', 'n_samples', 'n_features', + 'dataset_format']: + data_kwargs_edited.pop(param, None) + + path = os.path.dirname(os.path.realpath(__file__)) + command = """ + python {path}/auto_nvtx_bench.py + --algo_name {algo_name} + --dataset_type {dataset_type} + --n_samples {n_samples} + --n_features {n_features} + --dataset_format {dataset_format} + --data_kwargs {data_kwargs} + --setup_kwargs {setup_kwargs} + --training_kwargs {training_kwargs} + --inference_kwargs {inference_kwargs} + """.format(path=path, + algo_name=algo_name, + dataset_type=dataset_type, + n_samples=n_samples, + n_features=n_features, + dataset_format=dataset_format, + data_kwargs=json.dumps(data_kwargs_edited, + separators=(',', ':')), + setup_kwargs=json.dumps(setup_kwargs, + separators=(',', ':')), + training_kwargs=json.dumps(training_kwargs, + separators=(',', ':')), + inference_kwargs=json.dumps(inference_kwargs, + separators=(',', ':'))) + command = command.replace('\n', '').replace('\t', ' ') + command = ' '.join(command.split()) + + print('\n\n' + '\033[96m' + '=x'*48) + print('=x'*20 + ' NVTX BENCHMARK ' + '=x'*20) + + profiler = Profiler() + profiler.profile(command) + + print('=x'*48) + print('=x'*48 + '\033[0m' + '\n') + + +def cpu_bench(algo, bench_step, dataset, inference_args, cpu_setup): + if algo.cpu_class is None: + return + + t = time.process_time() + if bench_step == 'training': + algo.run_cpu(dataset, **cpu_setup) + elif bench_step == 'inference': + algo.run_cpu(dataset, **inference_args, **cpu_setup) + elapsed_time = time.process_time() - t + + print('\n' + '\033[33m' + '=x'*20 + ' CPU BENCHMARK ' + '=x'*20) + print(algo.name + ' : ' + str(algo.cpu_class)) + print('\tbench_function: ' + str(algo.bench_func)) + 
print('\truntime: ' + str(elapsed_time)) + print('=x'*48 + '\033[0m' + '\n') + + +def setup_bench(platform, algo, bench_step, dataset, + setup_kwargs, training_kwargs): + """ + Set up the AlgorithmPair and the model so they are ready for benchmarking + + Parameters + ---------- + platform : + Either 'cpu' or 'cuml' + algo : + AlgorithmPair instance, as defined in the algorithms.py file + bench_step : + Either 'training' or 'inference', describes the algorithm/model + step to be benchmarked + dataset : + Dataset used for the benchmark + setup_kwargs : + Algorithm/model setup kwargs + training_kwargs : + Algorithm/model training kwargs + """ + + # Generate the model + if platform == 'cuml': + setup = algo.setup_cuml(dataset, **setup_kwargs) + elif platform == 'cpu': + setup = algo.setup_cpu(dataset, **setup_kwargs) + + # Set the bench_func to perform training + if bench_step == 'training': + if hasattr(algo.cuml_class, 'fit'): + algo.bench_func = fit + # Model cannot be trained (special construction) + elif algo.setup_cuml_func: + pytest.skip('Model cannot be trained (special construction)') + else: + raise ValueError('Training function not found') + # Train the model and then set the bench_func to perform inference + elif bench_step == 'inference': + if hasattr(algo.cuml_class, 'fit'): + algo.bench_func = fit + # Model cannot be trained (special construction) + elif algo.setup_cuml_func: + algo.bench_func = pass_func + else: + raise ValueError('Training function not found') + + if platform == 'cuml': + setup['cuml_setup_result'] = \ + algo.run_cuml(dataset, bench_args=training_kwargs, **setup) + elif platform == 'cpu': + setup['cpu_setup_result'] = \ + algo.run_cpu(dataset, bench_args=training_kwargs, **setup) + + if hasattr(algo.cuml_class, 'predict'): + algo.bench_func = predict + elif hasattr(algo.cuml_class, 'transform'): + algo.bench_func = transform + elif hasattr(algo.cuml_class, 'kneighbors'): + algo.bench_func = kneighbors + elif any(hasattr(algo.cuml_class, attr) for attr in + ['fit_predict', 'fit_transform', 'fit_kneighbors']): + warnings.warn('Inference cannot be done separately, ' + 'doing both training and inference') + if hasattr(algo.cuml_class, 'fit_predict'): + algo.bench_func = fit_predict + elif hasattr(algo.cuml_class, 'fit_transform'): + algo.bench_func = fit_transform + elif hasattr(algo.cuml_class, 'fit_kneighbors'): + algo.bench_func = fit_kneighbors + else: + raise ValueError('Inference function not found') + else: + raise ValueError('bench_step should be either training or inference') + return setup + + +def _benchmark_algo( + benchmarker, + algo_name, + bench_step, + dataset, + setup_kwargs={}, + training_kwargs={}, + inference_kwargs={}, + client=None +): + """ + Benchmark utility + + Parameters + ---------- + benchmarker : + Pytest benchmark fixture used to enclose the code + that should be benchmarked + algo_name : + Algorithm/model name, can be found in the algorithms.py file + bench_step : + Either 'training' or 'inference', describes the algorithm/model + step to be benchmarked + dataset : + Tuple with the data and a dictionary that describes how it was built. + The dictionary can be later used during the NVTX benchmark. 
+ setup_kwargs : + Algorithm/model setup kwargs + training_kwargs : + Algorithm/model training kwargs + inference_kwargs : + Algorithm/model inference kwargs + client : + Dask client used in MNMG settings + """ + + # Get data and dict describing how it was built + dataset, data_kwargs = dataset + + # The presence of a Dask client signifies MNMG mode + MNMG_mode = client is not None + + # Distribute data in MNMG settings + if MNMG_mode: + # Add the client to the setup kwargs used by model instantiation + setup_kwargs['client'] = client + # Exception: data is scattered by the MNMG DBSCAN model itself + if algo_name != 'MNMG.DBSCAN': + # Distribute data + dataset = [distribute(client, d) for d in dataset] + + # Search AlgorithmPair instance by name + algo = algorithms.algorithm_by_name(algo_name) + # Set up the AlgorithmPair and the model to be ready for benchmark on GPU + cuml_setup = setup_bench('cuml', algo, bench_step, dataset, + setup_kwargs, training_kwargs) + + # Pytest benchmark + if bench_step == 'training': + benchmarker(algo.run_cuml, dataset, bench_args=training_kwargs, + **cuml_setup) + elif bench_step == 'inference': + benchmarker(algo.run_cuml, dataset, bench_args=inference_kwargs, + **cuml_setup) + + # CPU benchmark and NVTX benchmark (only in SG mode) + if not MNMG_mode: + # Check that the cuML model has a CPU equivalent + if algo.cpu_class: + # Convert dataset to a NumPy array + cpu_dataset = datagen._convert_to_numpy(dataset) + # Set up the AlgorithmPair and the model + # to be ready for benchmark on CPU + cpu_setup = setup_bench('cpu', algo, bench_step, cpu_dataset, + setup_kwargs, training_kwargs) + # CPU benchmark + cpu_bench(algo, bench_step, cpu_dataset, inference_kwargs, + cpu_setup) + + # NVTX benchmark performs both the training and inference at once + # but only when bench_step == 'inference' + if bench_step == 'inference': + # NVTX benchmark + nvtx_profiling(algo_name, data_kwargs, setup_kwargs, + training_kwargs, inference_kwargs) + + +def fixture_generation_helper(params): + param_names = sorted(params) + param_combis = list(it.product(*(params[param_name] + for param_name in param_names))) + ids = ['-'.join(map(str, param_combi)) for param_combi in param_combis] + param_combis = [dict(zip(param_names, param_combi)) + for param_combi in param_combis] + return { + 'scope': 'session', + 'params': param_combis, + 'ids': ids + } + + +@pytest.fixture(scope='session', + params=['training', 'inference'], + ids=['training', 'inference']) +def bench_step(request): + return request.param diff --git a/python/cuml/benchmark/bench_helper_funcs.py b/python/cuml/benchmark/bench_helper_funcs.py index 865b8e538e..bdf2b6936f 100644 --- a/python/cuml/benchmark/bench_helper_funcs.py +++ b/python/cuml/benchmark/bench_helper_funcs.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,23 +23,68 @@ import cudf from numba import cuda from cuml.benchmark import datagen +from cuml.manifold import UMAP + + +def call(m, func_name, X, y=None): + def unwrap_and_get_args(func): + if hasattr(func, '__wrapped__'): + return unwrap_and_get_args(func.__wrapped__) + else: + return func.__code__.co_varnames + + if not hasattr(m, func_name): + raise ValueError('Model does not have function ' + func_name) + func = getattr(m, func_name) + argnames = unwrap_and_get_args(func) + if y is not None and 'y' in argnames: + func(X, y=y) + else: + func(X) -def fit_kneighbors(m, x): - m.fit(x) - m.kneighbors(x) +def pass_func(m, x, y=None): + pass def fit(m, x, y=None): - m.fit(x) if y is None else m.fit(x, y) + call(m, 'fit', x, y) + + +def predict(m, x, y=None): + call(m, 'predict', x) + + +def transform(m, x, y=None): + call(m, 'transform', x) -def fit_transform(m, x): - m.fit_transform(x) +def kneighbors(m, x, y=None): + call(m, 'kneighbors', x) -def predict(m, x): - m.predict(x) +def fit_predict(m, x, y=None): + if hasattr(m, 'predict'): + fit(m, x, y) + predict(m, x) + else: + call(m, 'fit_predict', x, y) + + +def fit_transform(m, x, y=None): + if hasattr(m, 'transform'): + fit(m, x, y) + transform(m, x) + else: + call(m, 'fit_transform', x, y) + + +def fit_kneighbors(m, x, y=None): + if hasattr(m, 'kneighbors'): + fit(m, x, y) + kneighbors(m, x) + else: + call(m, 'fit_kneighbors', x, y) def _training_data_to_numpy(X, y): @@ -182,3 +227,20 @@ def _treelite_fil_accuracy_score(y_true, y_pred): y_pred_binary = input_utils.convert_dtype(y_pred1 > 0.5, np.int32) return cuml.metrics.accuracy_score(y_true1, y_pred_binary) + + +def _build_mnmg_umap(m, data, args, tmpdir): + client = args['client'] + del args['client'] + local_model = UMAP(**args) + + if isinstance(data, (tuple, list)): + local_data = [x.compute() for x in data if x is not None] + if len(local_data) == 2: + X, y = local_data + local_model.fit(X, y) + else: + X = local_data + local_model.fit(X) + + return m(client=client, model=local_model, **args) diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index b9d5e5ffa6..5a6f20cdfc 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -37,8 +37,9 @@ import cudf import gzip import functools -import numpy as np import os +import numpy as np +import cupy as cp import pandas as pd import cuml.datasets @@ -58,39 +59,37 @@ def _gen_data_regression(n_samples, n_features, random_state=42, n_samples = int(1e6) if n_features == 0: n_features = 100 + X_arr, y_arr = cuml.datasets.make_regression( - n_samples=n_samples, n_features=n_features, random_state=random_state, - dtype=dtype) - return cudf.DataFrame(X_arr), cudf.Series(y_arr) + n_samples=n_samples, n_features=n_features, + random_state=random_state, dtype=dtype) + return X_arr, y_arr -def _gen_data_blobs(n_samples, n_features, random_state=42, dtype=np.float32, - centers=None): + +def _gen_data_blobs(n_samples, n_features, random_state=42, centers=None, + dtype=np.float32): """Wrapper for sklearn make_blobs""" if n_samples == 0: n_samples = int(1e6) if n_features == 0: n_samples = 100 + X_arr, y_arr = cuml.datasets.make_blobs( n_samples=n_samples, n_features=n_features, centers=centers, random_state=random_state, dtype=dtype) - return ( - cudf.DataFrame(X_arr), - cudf.Series(y_arr), - ) + + return X_arr, y_arr -def _gen_data_zeros(n_samples, n_features, random_state=42, dtype=np.float32): +def _gen_data_zeros(n_samples, n_features, dtype=np.float32): """Dummy generator for use in testing - returns all 0s""" - return ( - cudf.DataFrame(np.zeros((n_samples, n_features), dtype=dtype)), - cudf.Series(np.zeros(n_samples, dtype=dtype)), - ) + return cp.zeros((n_samples, n_features), dtype=dtype), \ + cp.zeros(n_samples, dtype=dtype) -def _gen_data_classification( - n_samples, n_features, random_state=42, dtype=np.float32, n_classes=2 -): +def _gen_data_classification(n_samples, n_features, random_state=42, + n_classes=2, dtype=np.float32): """Wrapper for sklearn make_blobs""" if n_samples == 0: n_samples = int(1e6) @@ -101,14 +100,10 @@ def _gen_data_classification( n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=random_state, dtype=dtype) - return ( - cudf.DataFrame(X_arr), - cudf.Series(y_arr), - ) + return X_arr, y_arr -def _gen_data_higgs(n_samples=None, n_features=None, random_state=42, - dtype=np.float32): +def _gen_data_higgs(n_samples=None, n_features=None, dtype=np.float32): """Wrapper returning Higgs in Pandas format""" X_df, y_df = load_higgs() if n_samples == 0: @@ -125,7 +120,8 @@ def _gen_data_higgs(n_samples=None, n_features=None, random_state=42, "Higgs dataset has only %d rows, cannot support %d" % (X_df.shape[0], n_samples) ) - return X_df.iloc[:n_samples, :n_features], y_df.iloc[:n_samples] + return X_df.iloc[:n_samples, :n_features].astype(dtype), \ + y_df.iloc[:n_samples].astype(dtype) def _download_and_cache(url, compressed_filepath, decompressed_filepath): @@ -173,6 +169,8 @@ def _convert_to_numpy(data): return tuple([_convert_to_numpy(d) for d in data]) elif isinstance(data, np.ndarray): return data + elif isinstance(data, cp.ndarray): + return cp.asnumpy(data) elif isinstance(data, cudf.DataFrame): return data.to_numpy() elif isinstance(data, cudf.Series): @@ -183,6 +181,26 @@ def _convert_to_numpy(data): raise Exception("Unsupported type %s" % str(type(data))) +def _convert_to_cupy(data): + """Returns tuple data with all elements converted to cupy ndarrays""" + if data is None: + return None + elif isinstance(data, tuple): + return tuple([_convert_to_cupy(d) for d in data]) + elif isinstance(data, np.ndarray): + return cp.asarray(data) + elif isinstance(data, cp.ndarray): + return data + elif isinstance(data, 
cudf.DataFrame): + return data.values + elif isinstance(data, cudf.Series): + return data.values + elif isinstance(data, (pd.DataFrame, pd.Series)): + return cp.asarray(data.to_numpy()) + else: + raise Exception("Unsupported type %s" % str(type(data))) + + def _convert_to_cudf(data): if data is None: return None @@ -194,6 +212,18 @@ def _convert_to_cudf(data): return cudf.DataFrame.from_pandas(data) elif isinstance(data, pd.Series): return cudf.Series.from_pandas(data) + elif isinstance(data, np.ndarray): + data = np.squeeze(data) + if data.ndim == 1: + return cudf.Series(data) + else: + return cudf.DataFrame(data) + elif isinstance(data, cp.ndarray): + data = np.squeeze(cp.asnumpy(data)) + if data.ndim == 1: + return cudf.Series(data) + else: + return cudf.DataFrame(data) else: raise Exception("Unsupported type %s" % str(type(data))) @@ -207,6 +237,18 @@ def _convert_to_pandas(data): return data elif isinstance(data, (cudf.DataFrame, cudf.Series)): return data.to_pandas() + elif isinstance(data, np.ndarray): + data = np.squeeze(data) + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) + elif isinstance(data, cp.ndarray): + data = np.squeeze(cp.asnumpy(data)) + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) else: raise Exception("Unsupported type %s" % str(type(data))) @@ -285,6 +327,7 @@ def _convert_to_scipy_sparse_csc(data): } _data_converters = { 'numpy': _convert_to_numpy, + 'cupy': _convert_to_cupy, 'cudf': _convert_to_cudf, 'pandas': _convert_to_pandas, 'gpuarray': _convert_to_gpuarray, @@ -304,9 +347,7 @@ def gen_data( dataset_format, n_samples=0, n_features=0, - random_state=42, test_fraction=0.0, - dtype=np.float32, **kwargs ): """Returns a tuple of data from the specified generator. @@ -335,16 +376,17 @@ def gen_data( data = _data_generators[dataset_name]( int(n_samples / (1 - test_fraction)), n_features, - random_state, - dtype, **kwargs ) if test_fraction != 0.0: if n_samples == 0: n_samples = int(data[0].shape[0] * (1 - test_fraction)) + random_state_dict = ({'random_state': kwargs['random_state']} + if 'random_state' in kwargs else {}) X_train, X_test, y_train, y_test = tuple( sklearn.model_selection.train_test_split( - *data, train_size=n_samples, random_state=random_state + *data, train_size=n_samples, + **random_state_dict ) ) data = (X_train, y_train, X_test, y_test) diff --git a/python/cuml/benchmark/nvtx_benchmark.py b/python/cuml/benchmark/nvtx_benchmark.py index 83ecdf3df8..8a2b7338fd 100644 --- a/python/cuml/benchmark/nvtx_benchmark.py +++ b/python/cuml/benchmark/nvtx_benchmark.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -187,5 +187,6 @@ def profile(self, command): self._display_results(results) -profiler = Profiler() -profiler.profile(sys.argv[1]) +if __name__ == "__main__": + profiler = Profiler() + profiler.profile(sys.argv[1]) diff --git a/python/cuml/pytest_benchmarks/test_bench.py b/python/cuml/pytest_benchmarks/test_bench.py deleted file mode 100644 index 0cd18f1a15..0000000000 --- a/python/cuml/pytest_benchmarks/test_bench.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Demo integration of benchmarking to pytest interface -Requires pytest-benchmark, which is not currently installed by default. -""" - -from cuml.benchmark import datagen, algorithms -from cuml.common.import_utils import has_pytest_benchmark -import pytest - - -# -# Testing utilities -# -def _benchmark_algo( - benchmark, - name, - dataset_name, - n_samples=10000, - n_features=100, - input_type='numpy', - data_kwargs={}, - algo_args={}, -): - """Simplest benchmark wrapper to time algorithm 'name' on dataset - 'dataset_name'""" - algo = algorithms.algorithm_by_name(name) - data = datagen.gen_data( - dataset_name, - input_type, - n_samples=n_samples, - n_features=n_features, - **data_kwargs - ) - - def _benchmark_inner(): - algo.run_cuml(data, **algo_args) - - benchmark(_benchmark_inner) - - -# -# Core tests -# -@pytest.mark.skipif(not has_pytest_benchmark(), - reason='pytest-benchmark missing') -@pytest.mark.parametrize('n_rows', [1000, 10000]) -@pytest.mark.parametrize('n_features', [5, 500]) -def test_kmeans(benchmark, n_rows, n_features): - _benchmark_algo(benchmark, 'KMeans', 'blobs', n_rows, n_features) - - -@pytest.mark.skipif(not has_pytest_benchmark(), - reason='pytest-benchmark missing') -@pytest.mark.parametrize('algo_name', ['DBSCAN', 'UMAP-Supervised', - 'NearestNeighbors']) -def test_with_blobs(benchmark, algo_name): - # Lump together a bunch of simple blobs-based tests - _benchmark_algo(benchmark, algo_name, 'blobs', 10000, 100) - - -@pytest.mark.skipif(not has_pytest_benchmark(), - reason='pytest-benchmark missing') -@pytest.mark.parametrize('n_components', [2, 10, 50]) -def test_pca(benchmark, n_components): - _benchmark_algo( - benchmark, - 'PCA', - 'blobs', - 50000, - 100, - algo_args=dict(n_components=n_components), - ) diff --git a/python/cuml/test/test_benchmark.py b/python/cuml/test/test_benchmark.py index 3e1fbe1d85..efc376a55a 100644 --- a/python/cuml/test/test_benchmark.py +++ b/python/cuml/test/test_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,6 +28,8 @@ import time +from cuml.benchmark.bench_helper_funcs import fit, fit_predict + @pytest.mark.parametrize('dataset', ['blobs', 'regression', 'classification']) def test_data_generators(dataset): @@ -104,6 +106,7 @@ def __init__(self): FastMockAlgo, shared_args={}, name="Mock", + bench_func=fit_predict, accuracy_function=metrics.accuracy_score, ) @@ -128,6 +131,7 @@ def fit(self, X, y): CountingAlgo, CountingAlgo, shared_args={}, + bench_func=fit, name="Counting", ) @@ -157,6 +161,7 @@ def predict(self, X): MockAlgo, shared_args={}, name="Mock", + bench_func=fit_predict, accuracy_function=metrics.accuracy_score, )