Nightly automated benchmark (#4414)

This PR adds the code that enables nightly automated benchmark runs for `cuML`.

Authors:
- Victor Lafargue (https://github.com/viclafargue)
- Nanthini (https://github.com/Nanthini10)

Approvers:
- Dante Gama Dessavre (https://github.com/dantegd)

URL: #4414
rapidsai · Feb 9, 2022 · 1dd32dc · 1dd32dc
1 parent fcad23f
commit 1dd32dc
Show file tree

Hide file tree

Showing 21 changed files with 1,466 additions and 148 deletions.
diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,6 +26,8 @@
 import cuml.metrics
 import cuml.decomposition
 import cuml.naive_bayes
+from cuml.dask import neighbors, cluster, manifold, \
+    decomposition, linear_model  # noqa: F401
 from cuml.common.import_utils import has_umap
 import numpy as np
 import tempfile
@@ -37,14 +39,17 @@
 
 from cuml.benchmark.bench_helper_funcs import (
     fit,
-    fit_kneighbors,
-    fit_transform,
+    transform,
     predict,
+    fit_transform,
+    fit_predict,
+    fit_kneighbors,
     _build_cpu_skl_classifier,
     _build_fil_skl_classifier,
     _build_fil_classifier,
     _build_treelite_classifier,
     _treelite_fil_accuracy_score,
+    _build_mnmg_umap
 )
 import treelite
 import treelite_runtime
@@ -122,13 +127,13 @@ def __init__(
     def __str__(self):
         return "AlgoPair:%s" % (self.name)
 
-    def run_cpu(self, data, **override_args):
+    def run_cpu(self, data, bench_args={}, **override_setup_args):
         """Runs the cpu-based algorithm's fit method on specified data"""
         if self.cpu_class is None:
             raise ValueError("No CPU implementation for %s" % self.name)
 
         all_args = {**self.shared_args, **self.cpu_args}
-        all_args = {**all_args, **override_args}
+        all_args = {**all_args, **override_setup_args}
 
         if "cpu_setup_result" not in all_args:
             cpu_obj = self.cpu_class(**all_args)
@@ -137,16 +142,16 @@ def run_cpu(self, data, **override_args):
         if self.cpu_data_prep_hook:
             data = self.cpu_data_prep_hook(data)
         if self.accepts_labels:
-            self.bench_func(cpu_obj, data[0], data[1])
+            self.bench_func(cpu_obj, data[0], data[1], **bench_args)
         else:
-            self.bench_func(cpu_obj, data[0])
+            self.bench_func(cpu_obj, data[0], **bench_args)
 
         return cpu_obj
 
-    def run_cuml(self, data, **override_args):
+    def run_cuml(self, data, bench_args={}, **override_setup_args):
         """Runs the cuml-based algorithm's fit method on specified data"""
         all_args = {**self.shared_args, **self.cuml_args}
-        all_args = {**all_args, **override_args}
+        all_args = {**all_args, **override_setup_args}
 
         if "cuml_setup_result" not in all_args:
             cuml_obj = self.cuml_class(**all_args)
@@ -155,35 +160,35 @@ def run_cuml(self, data, **override_args):
         if self.cuml_data_prep_hook:
             data = self.cuml_data_prep_hook(data)
         if self.accepts_labels:
-            self.bench_func(cuml_obj, data[0], data[1])
+            self.bench_func(cuml_obj, data[0], data[1], **bench_args)
         else:
-            self.bench_func(cuml_obj, data[0])
+            self.bench_func(cuml_obj, data[0], **bench_args)
 
         return cuml_obj
 
     def setup_cpu(self, data, **override_args):
+        all_args = {**self.shared_args, **self.cpu_args}
+        all_args = {**all_args, **override_args}
         if self.setup_cpu_func is not None:
-            all_args = {**self.shared_args, **self.cpu_args}
-            all_args = {**all_args, **override_args}
             return {
                 "cpu_setup_result": self.setup_cpu_func(
                     self.cpu_class, data, all_args, self.tmpdir
                 )
             }
         else:
-            return {}
+            return all_args
 
     def setup_cuml(self, data, **override_args):
+        all_args = {**self.shared_args, **self.cuml_args}
+        all_args = {**all_args, **override_args}
         if self.setup_cuml_func is not None:
-            all_args = {**self.shared_args, **self.cuml_args}
-            all_args = {**all_args, **override_args}
             return {
                 "cuml_setup_result": self.setup_cuml_func(
                     self.cuml_class, data, all_args, self.tmpdir
                 )
             }
         else:
-            return {}
+            return all_args
 
 
 def _labels_to_int_hook(data):
@@ -228,15 +233,13 @@ def all_algorithms():
             cuml.random_projection.GaussianRandomProjection,
             shared_args=dict(n_components=10),
             name="GaussianRandomProjection",
-            bench_func=fit_transform,
             accepts_labels=False,
         ),
         AlgorithmPair(
             sklearn.random_projection.SparseRandomProjection,
             cuml.random_projection.SparseRandomProjection,
             shared_args=dict(n_components=10),
             name="SparseRandomProjection",
-            bench_func=fit_transform,
             accepts_labels=False,
         ),
         AlgorithmPair(
@@ -434,7 +437,7 @@ def all_algorithms():
             cuml.manifold.UMAP,
             shared_args=dict(n_neighbors=5, n_epochs=500),
             name="UMAP-Unsupervised",
-            accepts_labels=True,
+            accepts_labels=False,
             accuracy_function=cuml.metrics.trustworthiness,
         ),
         AlgorithmPair(
@@ -556,6 +559,139 @@ def all_algorithms():
             name="SparseCSRPolynomialFeatures",
             accepts_labels=False,
             bench_func=fit_transform
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.neighbors.KNeighborsClassifier,
+            shared_args={},
+            cuml_args={},
+            name="MNMG.KNeighborsClassifier",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=cuml.metrics.accuracy_score
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.cluster.KMeans,
+            shared_args=dict(n_clusters=8, max_iter=300, n_init=1),
+            cpu_args=dict(init="k-means++"),
+            cuml_args=dict(init="scalable-k-means++"),
+            name="MNMG.KMeans",
+            bench_func=fit_predict,
+            accepts_labels=False,
+            accuracy_function=metrics.homogeneity_score,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.cluster.DBSCAN,
+            shared_args=dict(eps=3, min_samples=2),
+            cpu_args=dict(algorithm="brute"),
+            name="MNMG.DBSCAN",
+            bench_func=fit_predict,
+            accepts_labels=False,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.manifold.UMAP,
+            shared_args=dict(n_neighbors=5, n_epochs=500),
+            name="MNMG.UMAP-Unsupervised",
+            bench_func=transform,
+            setup_cuml_func=_build_mnmg_umap,
+            accepts_labels=False,
+            accuracy_function=cuml.metrics.trustworthiness,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.manifold.UMAP,
+            shared_args=dict(n_neighbors=5, n_epochs=500),
+            name="MNMG.UMAP-Supervised",
+            bench_func=transform,
+            setup_cuml_func=_build_mnmg_umap,
+            accepts_labels=True,
+            accuracy_function=cuml.metrics.trustworthiness,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.neighbors.NearestNeighbors,
+            shared_args=dict(n_neighbors=1024),
+            cpu_args=dict(algorithm="brute", n_jobs=-1),
+            cuml_args={},
+            name="MNMG.NearestNeighbors",
+            accepts_labels=False,
+            bench_func=fit_kneighbors,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.decomposition.TruncatedSVD,
+            shared_args=dict(n_components=10),
+            name="MNMG.tSVD",
+            accepts_labels=False,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.decomposition.PCA,
+            shared_args=dict(n_components=10),
+            name="MNMG.PCA",
+            accepts_labels=False,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.linear_model.LinearRegression,
+            shared_args={},
+            name="MNMG.LinearRegression",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=metrics.r2_score,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.linear_model.Lasso,
+            shared_args={},
+            name="MNMG.Lasso",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=metrics.r2_score,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.linear_model.ElasticNet,
+            shared_args={"alpha": 0.1, "l1_ratio": 0.5},
+            name="MNMG.ElasticNet",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=metrics.r2_score,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.linear_model.Ridge,
+            shared_args={},
+            name="MNMG.Ridge",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=metrics.r2_score,
+        ),
+
+        AlgorithmPair(
+            None,
+            cuml.dask.neighbors.KNeighborsRegressor,
+            shared_args={},
+            cuml_args={},
+            name="MNMG.KNeighborsRegressor",
+            bench_func=fit_predict,
+            accepts_labels=True,
+            accuracy_function=cuml.metrics.r2_score
         )
     ]
 

diff --git a/python/cuml/benchmark/automated/__init__.py b/python/cuml/benchmark/automated/__init__.py
diff --git a/python/cuml/benchmark/automated/bench_classification.py b/python/cuml/benchmark/automated/bench_classification.py
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+from .utils.utils import _benchmark_algo, fixture_generation_helper
+from .utils.utils import bench_step  # noqa: F401
+from .. import datagen
+
+#
+# Core tests
+#
+
+
+@pytest.fixture(**fixture_generation_helper({
+                    'n_samples': [1000, 10000],
+                    'n_features': [5, 500]
+                }))
+def classification(request):
+    data = datagen.gen_data(
+        'classification',
+        'cupy',
+        n_samples=request.param['n_samples'],
+        n_features=request.param['n_features']
+    )
+    return data, {
+                    'dataset_type': 'classification',
+                    **request.param
+                 }
+
+
+def bench_logistic_regression(gpubenchmark, bench_step,  # noqa: F811
+                              classification):
+    _benchmark_algo(gpubenchmark, 'LogisticRegression',
+                    bench_step, classification)
+
+
+def bench_mbsgcclf(gpubenchmark, bench_step, classification):  # noqa: F811
+    _benchmark_algo(gpubenchmark, 'MBSGDClassifier',
+                    bench_step, classification)
+
+
+def bench_knnclassifier(gpubenchmark, bench_step,  # noqa: F811
+                        classification):
+    _benchmark_algo(gpubenchmark, 'KNeighborsClassifier',
+                    bench_step, classification)
+
+
+def bench_svc_linear(gpubenchmark, bench_step, classification):  # noqa: F811
+    _benchmark_algo(gpubenchmark, 'SVC-Linear',
+                    bench_step, classification)
+
+
+def bench_svc_rbf(gpubenchmark, bench_step, classification):  # noqa: F811
+    _benchmark_algo(gpubenchmark, 'SVC-RBF',
+                    bench_step, classification)