Feature/python benchmarking #11125

Merged (27 commits, Jun 27, 2022)

Changes from 8 commits

Commits (27)
acdbebb  Add all benchmark files. (vyasr, Jun 17, 2022)
28ed6a8  Fix most linter issues. (vyasr, Jun 17, 2022)
7e3db91  Add copyrights. (vyasr, Jun 17, 2022)
af6882f  Add missing module docstrings. (vyasr, Jun 17, 2022)
97c601c  Fix long lines for flake8. (vyasr, Jun 17, 2022)
55434af  Remove unnecessary README. (vyasr, Jun 17, 2022)
97bbe3e  Run benchmarks as tests in CI. (vyasr, Jun 17, 2022)
0dd018b  Update environment. (vyasr, Jun 17, 2022)
7c370f4  Enable usage of accepts_cudf_fixture with cases. (vyasr, Jun 21, 2022)
83ee6b7  Use makefun to create the wrapper more thoroughly. (vyasr, Jun 21, 2022)
c61ba9f  Address a couple minor parts of @isVoid review. (vyasr, Jun 21, 2022)
690577f  Rename CUDF_BENCHMARKS_TEST_ONLY to CUDF_BENCHMARKS_DEBUG_ONLY. (vyasr, Jun 21, 2022)
1025be4  Address most review comments. (vyasr, Jun 23, 2022)
5db4589  Convert the benchmarks into a package so that we can use relative imp… (vyasr, Jun 23, 2022)
6d57c10  Rename accepts_cudf_fixture to benchmark_with_object. (vyasr, Jun 23, 2022)
0521038  Remove the name parameter to benchmark_with_object. (vyasr, Jun 23, 2022)
0d69a1f  Run tests from inside the benchmarks directory to find the correct cudf. (vyasr, Jun 23, 2022)
b060ec7  Address PR comments. (vyasr, Jun 23, 2022)
ec4eaa7  Try running from the root. (vyasr, Jun 23, 2022)
fe2a3a0  Fix path to benchmarks. (vyasr, Jun 24, 2022)
dd01dd4  Switch to absolute imports. (vyasr, Jun 24, 2022)
0183a51  Add gpuci_logger line. (vyasr, Jun 24, 2022)
28a6ea6  Try using py.test instead of pytest (vyasr, Jun 25, 2022)
a9ecdf3  Try using identical commands as for testing. (vyasr, Jun 25, 2022)
5e5e3d4  Revert "Convert the benchmarks into a package so that we can use rela… (vyasr, Jun 25, 2022)
b57eee2  Fix style. (vyasr, Jun 25, 2022)
5f80061  Remove unnecessary coverage parameters for benchmarks. (vyasr, Jun 25, 2022)

5 changes: 5 additions & 0 deletions ci/gpu/build.sh
@@ -235,6 +235,11 @@ cd "$WORKSPACE/python/cudf/cudf"
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

# Run benchmarks with both cudf and pandas to ensure compatibility is maintained.
cd "$WORKSPACE/python/cudf"
CUDF_BENCHMARKS_TEST_ONLY=ON pytest -n 8 benchmarks
CUDF_BENCHMARKS_USE_PANDAS=ON CUDF_BENCHMARKS_TEST_ONLY=ON pytest -n 8 benchmarks

cd "$WORKSPACE/python/dask_cudf"
gpuci_logger "Python py.test for dask-cudf"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf
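For context, the benchmark modules shown below import cudf and cupy through a local config module ("from config import cudf, cupy"), which is what lets the CUDF_BENCHMARKS_USE_PANDAS=ON invocation above run the identical benchmarks against pandas. A minimal sketch of how such a switch could be implemented (an assumption for illustration; the PR's actual config.py may differ):

# Hypothetical sketch of python/cudf/benchmarks/config.py.
import os

if os.environ.get("CUDF_BENCHMARKS_USE_PANDAS", "OFF") == "ON":
    # Re-run the same benchmarks against pandas/numpy so that cudf's API
    # compatibility with pandas is exercised in CI.
    import numpy as cupy
    import pandas as cudf
else:
    import cupy
    import cudf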
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda11.5.yml
@@ -28,6 +28,7 @@ dependencies:
- cython>=0.29,<0.30
- fsspec>=0.6.0
- pytest
- pytest-cases
- pytest-benchmark
- pytest-xdist
- sphinx
115 changes: 115 additions & 0 deletions python/cudf/benchmarks/API/bench_dataframe.py
@@ -0,0 +1,115 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

"""Benchmarks of DataFrame methods."""

import string

import numpy
import pytest
from config import cudf, cupy
from utils import accepts_cudf_fixture


@pytest.mark.parametrize("N", [100, 1_000_000])
def bench_construction(benchmark, N):
    benchmark(cudf.DataFrame, {None: cupy.random.rand(N)})


@accepts_cudf_fixture(cls="dataframe", dtype="float", cols=6)
@pytest.mark.parametrize(
    "expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"]
)
def bench_eval_func(benchmark, expr, dataframe):
    benchmark(dataframe.eval, expr)


@accepts_cudf_fixture(
    cls="dataframe", dtype="int", nulls=False, cols=6, name="df"
)
@pytest.mark.parametrize(
    "nkey_cols",
    [2, 3, 4],
)
def bench_merge(benchmark, df, nkey_cols):
    benchmark(df.merge, df, on=list(df.columns[:nkey_cols]))


# TODO: Some of these cases could be generalized to an IndexedFrame benchmark
# instead of a DataFrame benchmark.
@accepts_cudf_fixture(cls="dataframe", dtype="int")
@pytest.mark.parametrize(
    "values",
    [
        range(1000),
        {f"key{i}": range(1000) for i in range(10)},
        cudf.DataFrame({f"key{i}": range(1000) for i in range(10)}),
        cudf.Series(range(1000)),
    ],
)
def bench_isin(benchmark, dataframe, values):
    benchmark(dataframe.isin, values)


@pytest.fixture(
    params=[0, numpy.random.RandomState, cupy.random.RandomState],
    ids=["Seed", "NumpyRandomState", "CupyRandomState"],
)
def random_state(request):
    rs = request.param
    return rs if isinstance(rs, int) else rs(seed=42)

Contributor (review comment on the random_state fixture):
Does the type of random state used affect performance? If not, do we need all three benchmarks? Or is this really about test coverage (in which case it should be a test, not a benchmark)?

vyasr (Contributor Author):
The performance does matter for a couple of reasons, mostly because our sampling implementation differs for sampling rows vs. columns: cupy arrays can't be used for column sampling (it has to happen on the host), but either can be used for sampling rows, and there is a meaningful performance difference between them.
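To make that distinction concrete, here is a minimal sketch (illustration only, not part of the diff; the example frame is assumed) of the combinations that bench_sample below exercises or skips:

import cudf
import cupy
import numpy

gdf = cudf.DataFrame({col: range(100) for col in "abcd"})

# Row sampling can consume a device (cupy) random state.
gdf.sample(frac=0.5, axis=0, random_state=cupy.random.RandomState(42))

# Column sampling happens on the host, so it takes an int seed or a numpy
# RandomState instead.
gdf.sample(frac=0.5, axis=1, random_state=numpy.random.RandomState(42))

# A cupy RandomState with axis=1 is the unsupported combination that
# bench_sample skips.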


@accepts_cudf_fixture(cls="dataframe", dtype="int")
@pytest.mark.parametrize("frac", [0.5])
def bench_sample(benchmark, dataframe, axis, frac, random_state):
    if axis == 1 and isinstance(random_state, cupy.random.RandomState):
        pytest.skip("Unsupported params.")
    benchmark(
        dataframe.sample, frac=frac, axis=axis, random_state=random_state
    )


@accepts_cudf_fixture(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize(
    "nkey_cols",
    [2, 3, 4],
)
def bench_groupby(benchmark, dataframe, nkey_cols):
    benchmark(dataframe.groupby, by=list(dataframe.columns[:nkey_cols]))


@accepts_cudf_fixture(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize(
    "agg",
    [
        "sum",
        ["sum", "mean"],
        {
            f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"]
            for i in range(6)
        },
    ],
)
@pytest.mark.parametrize(
    "nkey_cols",
    [2, 3, 4],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def bench_groupby_agg(benchmark, dataframe, agg, nkey_cols, as_index, sort):
    by = list(dataframe.columns[:nkey_cols])
    benchmark(dataframe.groupby(by=by, as_index=as_index, sort=sort).agg, agg)

Comment on lines +88 to +89 (bdice, Contributor, Jun 22, 2022):
We might want to be explicit here, especially since the value 6 is not parameterized by cols=6.

Suggested change:
-            f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"]
-            for i in range(6)
+            col: ["sum", "mean", "count"]
+            for col in ["a", "b", "c", "d", "e", "f"]


@accepts_cudf_fixture(cls="dataframe", dtype="int")
@pytest.mark.parametrize("ncol_sort", [1])
def bench_sort_values(benchmark, dataframe, ncol_sort):
    benchmark(dataframe.sort_values, list(dataframe.columns[:ncol_sort]))


@accepts_cudf_fixture(cls="dataframe", dtype="int")
@pytest.mark.parametrize("ncol_sort", [1])
@pytest.mark.parametrize("n", [10])
def bench_nsmallest(benchmark, dataframe, ncol_sort, n):
    by = list(dataframe.columns[:ncol_sort])
    benchmark(dataframe.nsmallest, n, by)
88 changes: 88 additions & 0 deletions python/cudf/benchmarks/API/bench_frame_or_index.py
@@ -0,0 +1,88 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

"""Benchmarks of methods that exist for both Frame and BaseIndex."""

import operator

import numpy as np
import pytest
from utils import accepts_cudf_fixture, make_gather_map


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
@pytest.mark.parametrize("gather_how", ["sequence", "reverse", "random"])
@pytest.mark.parametrize("fraction", [0.4])
def bench_take(benchmark, gather_how, fraction, frame_or_index):
    nr = len(frame_or_index)
    gather_map = make_gather_map(nr * fraction, nr, gather_how)
    benchmark(frame_or_index.take, gather_map)


@pytest.mark.pandas_incompatible  # Series/Index work, but not DataFrame
@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_argsort(benchmark, frame_or_index):
    benchmark(frame_or_index.argsort)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_min(benchmark, frame_or_index):
    benchmark(frame_or_index.min)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_where(benchmark, frame_or_index):
    cond = frame_or_index % 2 == 0
    benchmark(frame_or_index.where, cond, 0)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int", nulls=False)
@pytest.mark.pandas_incompatible
def bench_values_host(benchmark, frame_or_index):
    benchmark(lambda: frame_or_index.values_host)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int", nulls=False)
def bench_values(benchmark, frame_or_index):
    benchmark(lambda: frame_or_index.values)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_nunique(benchmark, frame_or_index):
    benchmark(frame_or_index.nunique)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int", nulls=False)
def bench_to_numpy(benchmark, frame_or_index):
    benchmark(frame_or_index.to_numpy)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int", nulls=False)
@pytest.mark.pandas_incompatible
def bench_to_cupy(benchmark, frame_or_index):
    benchmark(frame_or_index.to_cupy)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
@pytest.mark.pandas_incompatible
def bench_to_arrow(benchmark, frame_or_index):
    benchmark(frame_or_index.to_arrow)


@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_astype(benchmark, frame_or_index):
    benchmark(frame_or_index.astype, float)


@pytest.mark.parametrize("ufunc", [np.add, np.logical_and])
@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_ufunc_series_binary(benchmark, frame_or_index, ufunc):
    benchmark(ufunc, frame_or_index, frame_or_index)


@pytest.mark.parametrize(
    "op",
    [operator.add, operator.mul, operator.eq],
)
@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_binops(benchmark, op, frame_or_index):
    benchmark(lambda: op(frame_or_index, frame_or_index))

Contributor (review comment on the op list):
Should we add a comparator? Less than? It takes a different code path in libcudf so might be worth adding.
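A possible extension along the lines of that comment (a sketch, not part of this diff) would simply add the comparator to the parametrized operators:

@pytest.mark.parametrize(
    "op",
    # operator.lt added per the review suggestion; less-than exercises a
    # different code path in libcudf than equality.
    [operator.add, operator.mul, operator.eq, operator.lt],
)
@accepts_cudf_fixture(cls="frame_or_index", dtype="int")
def bench_binops(benchmark, op, frame_or_index):
    benchmark(lambda: op(frame_or_index, frame_or_index))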
56 changes: 56 additions & 0 deletions python/cudf/benchmarks/API/bench_functions.py
@@ -0,0 +1,56 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

"""Benchmarks of free functions that accept cudf objects."""

import pytest
import pytest_cases
from config import cudf, cupy


@pytest_cases.parametrize_with_cases("objs", prefix="concat")
@pytest.mark.parametrize(
    "axis",
    [
        1,
    ],
)
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("ignore_index", [True, False])
def bench_concat_axis_1(benchmark, objs, axis, join, ignore_index):
    benchmark(
        cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index
    )


@pytest.mark.parametrize("size", [10_000, 100_000])
@pytest.mark.parametrize("cardinality", [10, 100, 1000])
@pytest.mark.parametrize("dtype", [cupy.bool_, cupy.float64])
def bench_get_dummies_high_cardinality(benchmark, size, cardinality, dtype):
    """Benchmark get_dummies when the cardinality of the column to encode is
    high.
    """
    df = cudf.DataFrame(
        {
            "col": cudf.Series(
                cupy.random.randint(low=0, high=cardinality, size=size)
            ).astype("category")
        }
    )
    benchmark(cudf.get_dummies, df, columns=["col"], dtype=dtype)


@pytest.mark.parametrize("prefix", [None, "pre"])
def bench_get_dummies_simple(benchmark, prefix):
    """This test provides a small input to get_dummies to test the efficiency
    of the API itself.
    """
    df = cudf.DataFrame(
        {
            "col1": list(range(10)),
            "col2": list("abcdefghij"),
            "col3": cudf.Series(list(range(100, 110)), dtype="category"),
        }
    )
    benchmark(
        cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix
    )