-
Notifications
You must be signed in to change notification settings - Fork 916
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/python benchmarking (#11125)
This PR ports the benchmarks in https://github.com/vyasr/cudf_benchmarks, adding official benchmarks to the repository. The new benchmarks are designed from the ground up to make the best use of pytest, pytest-benchmark, and pytest-cases to simplify writing and maintaining benchmarks. Extended discussions of various previous design questions may be found on [the original repo](https://github.com/vyasr/cudf_benchmarks). Reviewers may also benefit from reviewing the companion PR that adds documentation on how to write benchmarks, #11122. Tests will not pass here until rapidsai/integration#492 is merged.

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- AJ Schmidt (https://github.com/ajschmidt8)
- Bradley Dice (https://github.com/bdice)
- Michael Wang (https://github.com/isVoid)
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Matthew Roeschke (https://github.com/mroeschke)

URL: #11125
- Loading branch information
Showing
18 changed files
with
1,292 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
|
||
"""Benchmarks of DataFrame methods.""" | ||
|
||
import string | ||
|
||
import numpy | ||
import pytest | ||
from config import cudf, cupy | ||
from utils import benchmark_with_object | ||
|
||
|
||
@pytest.mark.parametrize("N", [100, 1_000_000])
def bench_construction(benchmark, N):
    """Benchmark constructing a one-column DataFrame of N random values."""
    # The random column is generated up front so that only the DataFrame
    # constructor call is handed to the benchmark fixture.
    column_data = cupy.random.rand(N)
    benchmark(cudf.DataFrame, {None: column_data})
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="float", cols=6)
@pytest.mark.parametrize(
    "expr",
    [
        "a+b",
        "a+b+c+d+e",
        "a / (sin(a) + cos(b)) * tanh(d*e*f)",
    ],
)
def bench_eval_func(benchmark, expr, dataframe):
    """Benchmark ``dataframe.eval`` on each parametrized expression."""
    evaluate = dataframe.eval
    benchmark(evaluate, expr)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize("num_key_cols", [2, 3, 4])
def bench_merge(benchmark, dataframe, num_key_cols):
    """Benchmark merging ``dataframe`` with itself.

    The join keys are the first ``num_key_cols`` columns of the frame.
    """
    key_columns = list(dataframe.columns[:num_key_cols])
    benchmark(dataframe.merge, dataframe, on=key_columns)
|
||
|
||
# TODO: Some of these cases could be generalized to an IndexedFrame benchmark | ||
# instead of a DataFrame benchmark. | ||
@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize(
    "values",
    [
        range(1000),
        {f"key{k}": range(1000) for k in range(10)},
        cudf.DataFrame({f"key{k}": range(1000) for k in range(10)}),
        cudf.Series(range(1000)),
    ],
)
def bench_isin(benchmark, dataframe, values):
    """Benchmark ``dataframe.isin`` against several kinds of ``values``.

    The parametrization covers a plain range, a dict of ranges, a DataFrame
    and a Series.
    """
    membership_test = dataframe.isin
    benchmark(membership_test, values)
|
||
|
||
@pytest.fixture(
    params=[0, numpy.random.RandomState, cupy.random.RandomState],
    ids=["Seed", "NumpyRandomState", "CupyRandomState"],
)
def random_state(request):
    """Provide either an integer seed or a seeded RandomState instance.

    Integer params are returned unchanged; any other param is called with
    ``seed=42`` and the resulting object is returned.
    """
    param = request.param
    if isinstance(param, int):
        return param
    return param(seed=42)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("frac", [0.5])
def bench_sample(benchmark, dataframe, axis, frac, random_state):
    """Benchmark ``dataframe.sample`` for the given axis and random state.

    The combination of ``axis == 1`` with a cupy ``RandomState`` is skipped
    as unsupported.
    """
    if axis == 1:
        if isinstance(random_state, cupy.random.RandomState):
            pytest.skip("Unsupported params.")
    sample_kwargs = {"frac": frac, "axis": axis, "random_state": random_state}
    benchmark(dataframe.sample, **sample_kwargs)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize("num_key_cols", [2, 3, 4])
def bench_groupby(benchmark, dataframe, num_key_cols):
    """Benchmark calling ``dataframe.groupby`` keyed on its leading columns.

    Only the ``groupby`` call itself is benchmarked; no aggregation is run.
    """
    group_keys = list(dataframe.columns[:num_key_cols])
    benchmark(dataframe.groupby, by=group_keys)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize(
    "agg",
    [
        "sum",
        ["sum", "mean"],
        {
            letter: ["sum", "mean", "count"]
            for letter in string.ascii_lowercase[:6]
        },
    ],
)
@pytest.mark.parametrize("num_key_cols", [2, 3, 4])
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def bench_groupby_agg(benchmark, dataframe, agg, num_key_cols, as_index, sort):
    """Benchmark ``GroupBy.agg`` for several aggregation specifications.

    The groupby object is built outside the benchmarked call, so only the
    ``agg`` invocation is handed to the benchmark fixture.
    """
    key_columns = list(dataframe.columns[:num_key_cols])
    grouped = dataframe.groupby(by=key_columns, as_index=as_index, sort=sort)
    benchmark(grouped.agg, agg)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("num_cols_to_sort", [1])
def bench_sort_values(benchmark, dataframe, num_cols_to_sort):
    """Benchmark ``dataframe.sort_values`` on its leading columns."""
    sort_keys = list(dataframe.columns[:num_cols_to_sort])
    benchmark(dataframe.sort_values, sort_keys)
|
||
|
||
@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("num_cols_to_sort", [1])
@pytest.mark.parametrize("n", [10])
def bench_nsmallest(benchmark, dataframe, num_cols_to_sort, n):
    """Benchmark ``dataframe.nsmallest`` ordered by its leading columns."""
    order_columns = list(dataframe.columns[:num_cols_to_sort])
    select_smallest = dataframe.nsmallest
    benchmark(select_smallest, n, order_columns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
|
||
"""Benchmarks of methods that exist for both Frame and BaseIndex.""" | ||
|
||
import operator | ||
|
||
import numpy as np | ||
import pytest | ||
from utils import benchmark_with_object, make_gather_map | ||
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
@pytest.mark.parametrize("gather_how", ["sequence", "reverse", "random"])
@pytest.mark.parametrize("fraction", [0.4])
def bench_take(benchmark, gather_how, fraction, frame_or_index):
    """Benchmark ``take`` with a gather map built by ``make_gather_map``.

    The requested gather-map length is ``fraction`` times the object's
    length; ``gather_how`` is forwarded to ``make_gather_map`` unchanged.
    """
    num_rows = len(frame_or_index)
    indices = make_gather_map(num_rows * fraction, num_rows, gather_how)
    benchmark(frame_or_index.take, indices)
|
||
|
||
@pytest.mark.pandas_incompatible  # Series/Index work, but not DataFrame
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_argsort(benchmark, frame_or_index):
    """Benchmark ``argsort`` on the supplied object."""
    compute_argsort = frame_or_index.argsort
    benchmark(compute_argsort)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_min(benchmark, frame_or_index):
    """Benchmark the ``min`` reduction on the supplied object."""
    compute_min = frame_or_index.min
    benchmark(compute_min)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_where(benchmark, frame_or_index):
    """Benchmark ``where`` with an even-value mask and a fill value of 0.

    The mask is computed before benchmarking, so only the ``where`` call is
    handed to the benchmark fixture.
    """
    is_even = (frame_or_index % 2) == 0
    benchmark(frame_or_index.where, is_even, 0)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int", nulls=False)
@pytest.mark.pandas_incompatible
def bench_values_host(benchmark, frame_or_index):
    """Benchmark reading the ``values_host`` attribute."""

    # The attribute access is wrapped in a callable so that it happens
    # inside the benchmarked call rather than at argument-evaluation time.
    def read_values_host():
        return frame_or_index.values_host

    benchmark(read_values_host)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int", nulls=False)
def bench_values(benchmark, frame_or_index):
    """Benchmark reading the ``values`` attribute."""

    # The attribute access is wrapped in a callable so that it happens
    # inside the benchmarked call rather than at argument-evaluation time.
    def read_values():
        return frame_or_index.values

    benchmark(read_values)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_nunique(benchmark, frame_or_index):
    """Benchmark ``nunique`` on the supplied object."""
    count_unique = frame_or_index.nunique
    benchmark(count_unique)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int", nulls=False)
def bench_to_numpy(benchmark, frame_or_index):
    """Benchmark ``to_numpy`` on the supplied object."""
    convert = frame_or_index.to_numpy
    benchmark(convert)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int", nulls=False)
@pytest.mark.pandas_incompatible
def bench_to_cupy(benchmark, frame_or_index):
    """Benchmark ``to_cupy`` on the supplied object."""
    convert = frame_or_index.to_cupy
    benchmark(convert)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
@pytest.mark.pandas_incompatible
def bench_to_arrow(benchmark, frame_or_index):
    """Benchmark ``to_arrow`` on the supplied object."""
    convert = frame_or_index.to_arrow
    benchmark(convert)
|
||
|
||
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_astype(benchmark, frame_or_index):
    """Benchmark casting the supplied object to ``float`` via ``astype``."""
    target_dtype = float
    benchmark(frame_or_index.astype, target_dtype)
|
||
|
||
@pytest.mark.parametrize("ufunc", [np.add, np.logical_and])
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_ufunc_series_binary(benchmark, frame_or_index, ufunc):
    """Benchmark a binary NumPy ufunc applied to the object and itself."""
    # Both operands are deliberately the same object.
    lhs = rhs = frame_or_index
    benchmark(ufunc, lhs, rhs)
|
||
|
||
@pytest.mark.parametrize("op", [operator.add, operator.mul, operator.eq])
@benchmark_with_object(cls="frame_or_index", dtype="int")
def bench_binops(benchmark, op, frame_or_index):
    """Benchmark a binary operator applied to the object and itself."""

    def apply_op():
        return op(frame_or_index, frame_or_index)

    benchmark(apply_op)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
|
||
"""Benchmarks of free functions that accept cudf objects.""" | ||
|
||
import pytest | ||
import pytest_cases | ||
from config import cudf, cupy | ||
|
||
|
||
@pytest_cases.parametrize_with_cases("objs", prefix="concat")
@pytest.mark.parametrize("axis", [1])
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("ignore_index", [True, False])
def bench_concat_axis_1(benchmark, objs, axis, join, ignore_index):
    """Benchmark ``cudf.concat`` along axis 1 for each join/index option."""
    concat_kwargs = {
        "objs": objs,
        "axis": axis,
        "join": join,
        "ignore_index": ignore_index,
    }
    benchmark(cudf.concat, **concat_kwargs)
|
||
|
||
@pytest.mark.parametrize("size", [10_000, 100_000])
@pytest.mark.parametrize("cardinality", [10, 100, 1000])
@pytest.mark.parametrize("dtype", [cupy.bool_, cupy.float64])
def bench_get_dummies_high_cardinality(benchmark, size, cardinality, dtype):
    """Benchmark ``get_dummies`` when the encoded column has many categories.

    The input is a single categorical column of ``size`` random integers
    drawn from ``[0, cardinality)``.
    """
    codes = cupy.random.randint(low=0, high=cardinality, size=size)
    categorical = cudf.Series(codes).astype("category")
    frame = cudf.DataFrame({"col": categorical})
    benchmark(cudf.get_dummies, frame, columns=["col"], dtype=dtype)
|
||
|
||
@pytest.mark.parametrize("prefix", [None, "pre"])
def bench_get_dummies_simple(benchmark, prefix):
    """Benchmark ``get_dummies`` on a tiny frame to gauge API overhead.

    The frame has ten rows and three columns: integers, single-character
    strings, and a categorical column.
    """
    columns = {
        "col1": list(range(10)),
        "col2": list("abcdefghij"),
        "col3": cudf.Series(list(range(100, 110)), dtype="category"),
    }
    frame = cudf.DataFrame(columns)
    encode_columns = ["col1", "col2", "col3"]
    benchmark(cudf.get_dummies, frame, columns=encode_columns, prefix=prefix)
Oops, something went wrong.