From 851e23545d29bca17a778f84dbc2000b3dde0ba8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 15 Feb 2022 10:31:45 -0600
Subject: [PATCH] Reduce pytest runtime (#10203)

This PR reduces the overall runtime of the cuDF pytest suite. Changes include:

- asserting equality on the GPU where possible for large datasets
- reducing excessive test data size in some cases

Part of https://github.com/rapidsai/cudf/issues/9999

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10203
---
 python/cudf/cudf/testing/_utils.py            |  9 +++-
 .../test_avro_reader_fastavro_integration.py  |  4 +-
 python/cudf/cudf/tests/test_binops.py         | 15 +++---
 python/cudf/cudf/tests/test_csv.py            | 11 +++--
 .../cudf/tests/test_extension_compilation.py  | 12 +++--
 python/cudf/cudf/tests/test_indexing.py       | 46 +++++++++----------
 python/cudf/cudf/tests/test_orc.py            | 24 +++++-----
 python/cudf/cudf/tests/test_parquet.py        |  8 ++--
 python/cudf/cudf/tests/test_repr.py           | 45 ++++++++----------
 python/cudf/cudf/tests/test_reshape.py        |  8 ++--
 python/cudf/cudf/tests/test_string.py         |  7 +--
 python/cudf/cudf/tests/test_udf_masked_ops.py | 18 +++++---
 12 files changed, 106 insertions(+), 101 deletions(-)

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index b97b2d660d6..e767c0c62be 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
+import itertools
 import re
 import warnings
 from collections.abc import Mapping, Sequence
@@ -330,3 +331,9 @@ def does_not_raise():
 
 def xfail_param(param, **kwargs):
     return pytest.param(param, marks=pytest.mark.xfail(**kwargs))
+
+
+parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
+    "left_dtype,right_dtype",
+    list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
+)
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index a064bec9e82..9eb01ae31b4 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -210,7 +210,7 @@ def test_can_parse_no_schema():
     assert_eq(expected, actual)
 
 
-@pytest.mark.parametrize("rows", [0, 1, 10, 100000])
+@pytest.mark.parametrize("rows", [0, 1, 10, 1000])
 @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"])
 def test_avro_compression(rows, codec):
     schema = {
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 76add8b9c5d..02ca7a0cd58 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -4,7 +4,7 @@
 import decimal
 import operator
 import random
-from itertools import product
+from itertools import combinations_with_replacement, product
 
 import cupy as cp
 import numpy as np
@@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype):
 
 
 def _series_compare_nulls_typegen():
-    tests = []
-    tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
-    tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
-    tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
-    tests += list(product(STRING_TYPES, STRING_TYPES))
-
-    return tests
+    return [
+        *combinations_with_replacement(DATETIME_TYPES, 2),
+        *combinations_with_replacement(TIMEDELTA_TYPES, 2),
+        *combinations_with_replacement(NUMERIC_TYPES, 2),
+        *combinations_with_replacement(STRING_TYPES, 2),
+    ]
 
 
 @pytest.mark.parametrize("cmpop", _cmpops)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 9208b8c7cd4..f3d69e1745e 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import gzip
 import os
@@ -8,6 +8,7 @@
 from io import BytesIO, StringIO
 from pathlib import Path
 
+import cupy as cp
 import numpy as np
 import pandas as pd
 import pytest
@@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir):
 def test_csv_reader_carriage_return(tmpdir):
     rows = 1000
     names = ["int_row", "int_double_row"]
-
     buffer = ",".join(names) + "\r\n"
     for row in range(rows):
         buffer += str(row) + ", " + str(2 * row) + "\r\n"
 
     df = read_csv(StringIO(buffer))
+    expect = cudf.DataFrame(
+        {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
+    )
 
     assert len(df) == rows
-    for row in range(0, rows):
-        assert df[names[0]][row] == row
-        assert df[names[1]][row] == 2 * row
+    assert_eq(expect, df)
 
 
 def test_csv_reader_tabs():
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 47c9448cf63..692f40873d7 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,13 +1,17 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 import operator
 
 import cupy as cp
+import numpy as np
 import pytest
 from numba import cuda, types
 from numba.cuda import compile_ptx
+from numba.np.numpy_support import from_dtype
 
 from cudf import NA
 from cudf.core.udf.api import Masked
 from cudf.core.udf.typing import MaskedType
+from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
 
 arith_ops = (
     operator.add,
@@ -159,19 +163,21 @@ def func(x):
 
 
 @pytest.mark.parametrize("op", ops)
-@pytest.mark.parametrize("ty1", number_types, ids=number_ids)
-@pytest.mark.parametrize("ty2", number_types, ids=number_ids)
+@parametrize_numeric_dtypes_pairwise
 @pytest.mark.parametrize(
     "masked",
     ((False, True), (True, False), (True, True)),
     ids=("um", "mu", "mm"),
 )
-def test_compile_arith_masked_ops(op, ty1, ty2, masked):
+def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
     def func(x, y):
         return op(x, y)
 
     cc = (7, 5)
 
+    ty1 = from_dtype(np.dtype(left_dtype))
+    ty2 = from_dtype(np.dtype(right_dtype))
+
     if masked[0]:
         ty1 = MaskedType(ty1)
     if masked[1]:
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 102e5b57e8e..19d7c8a10ab 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
 from itertools import combinations
 
@@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe):
 
 
 @pytest.mark.parametrize(
-    "gdf",
+    "gdf_kwargs",
     [
-        cudf.DataFrame({"a": range(1000000)}),
-        cudf.DataFrame({"a": range(1000000), "b": range(1000000)}),
-        cudf.DataFrame({"a": range(20), "b": range(20)}),
-        cudf.DataFrame(
-            {
+        {"data": {"a": range(100000)}},
+        {"data": {"a": range(100000), "b": range(100000)}},
+        {
+            "data": {
                 "a": range(20),
                 "b": range(20),
                 "c": ["abc", "def", "xyz", "def", "pqr"] * 4,
             }
-        ),
-        cudf.DataFrame(index=[1, 2, 3]),
-        cudf.DataFrame(index=range(1000000)),
-        cudf.DataFrame(columns=["a", "b", "c", "d"]),
-        cudf.DataFrame(columns=["a"], index=range(1000000)),
-        cudf.DataFrame(
-            columns=["a", "col2", "...col n"], index=range(1000000)
-        ),
-        cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")),
-        cudf.DataFrame(
-            columns=["a", "b", "c", "d"],
-            index=cudf.Series(range(1000000)).astype("str"),
-        ),
+        },
+        {"index": [1, 2, 3]},
+        {"index": range(100000)},
+        {"columns": ["a", "b", "c", "d"]},
+        {"columns": ["a"], "index": range(100000)},
+        {"columns": ["a", "col2", "...col n"], "index": range(100000)},
+        {"index": cudf.Series(range(100000)).astype("str")},
+        {
+            "columns": ["a", "b", "c", "d"],
+            "index": cudf.Series(range(100000)).astype("str"),
+        },
     ],
 )
 @pytest.mark.parametrize(
     "slice",
     [
-        slice(250000, 500000),
-        slice(250000, 250001),
-        slice(500000),
+        slice(25000, 50000),
+        slice(25000, 25001),
+        slice(50000),
         slice(1, 10),
         slice(10, 20),
         slice(15, 24000),
         slice(6),
     ],
 )
-def test_dataframe_sliced(gdf, slice):
+def test_dataframe_sliced(gdf_kwargs, slice):
+    gdf = cudf.DataFrame(**gdf_kwargs)
     pdf = gdf.to_pandas()
 
     actual = gdf[slice]
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 8689f773a02..623098741a9 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -16,6 +16,7 @@
 
 import cudf
 from cudf.io.orc import ORCWriter
+from cudf.testing import assert_frame_equal
 from cudf.testing._utils import (
     assert_eq,
     gen_rand_series,
@@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
         path, engine=engine, columns=columns, use_index=use_index
     )
 
-    assert_eq(expect, got, check_categorical=False)
+    assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)
 
 
 def test_orc_reader_filenotfound(tmpdir):
@@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
         else:
             print(type(excpr).__name__)
 
-    expect = orcfile.read(columns=columns).to_pandas()
-    cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression)
-    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
+    expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
+    expect.to_orc(gdf_fname.strpath, compression=compression)
+    got = cudf.from_pandas(
+        pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
+    )
 
-    assert_eq(expect, got)
+    assert_frame_equal(expect, got)
 
 
 @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
         else:
             print(type(excpr).__name__)
 
-    expect = orcfile.read().to_pandas()
-    cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq)
-    got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
+    expect = cudf.from_pandas(orcfile.read().to_pandas())
+    expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
+    got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())
 
-    assert_eq(expect, got)
+    assert_frame_equal(expect, got)
 
 
 @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
@@ -492,8 +495,7 @@ def test_chunked_orc_writer(
     writer.close()
 
     got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
-
-    assert_eq(expect, got)
+    assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index e1ca9f6f006..7feaa400446 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("skip", range(0, 128))
+@pytest.mark.parametrize("skip", [0, 1, 5, 10])
 def test_parquet_reader_list_skiprows(skip, tmpdir):
-    num_rows = 128
+    num_rows = 10
     src = pd.DataFrame(
         {
             "a": list_gen(int_gen, 0, num_rows, 80, 50),
@@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
     assert_eq(expect, got, check_dtype=False)
 
 
-@pytest.mark.parametrize("skip", range(0, 120))
+@pytest.mark.parametrize("skip", [0, 1, 5, 10])
 def test_parquet_reader_list_num_rows(skip, tmpdir):
-    num_rows = 128
+    num_rows = 20
     src = pd.DataFrame(
         {
             "a": list_gen(int_gen, 0, num_rows, 80, 50),
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index ca02ee55df0..8f2e4811e36 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 
 import textwrap
 
@@ -13,7 +13,14 @@
 from cudf.testing import _utils as utils
 from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
 
-repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
+repr_categories = [
+    "uint16",
+    "int64",
+    "float64",
+    "str",
+    "category",
+    "datetime64[ns]",
+]
 
 
 @pytest.mark.parametrize("dtype", repr_categories)
@@ -84,36 +91,22 @@ def test_full_series(nrows, dtype):
     pd.reset_option("display.max_rows")
 
 
+@pytest.mark.parametrize("nrows", [5, 10, 15])
+@pytest.mark.parametrize("ncols", [5, 10, 15])
+@pytest.mark.parametrize("size", [20, 21])
 @pytest.mark.parametrize("dtype", repr_categories)
-@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
-@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1])
-def test_full_dataframe_20(dtype, nrows, ncols):
-    size = 20
-    pdf = pd.DataFrame(
-        {idx: np.random.randint(0, 100, size) for idx in range(size)}
-    ).astype(dtype)
-    gdf = cudf.from_pandas(pdf)
-
-    assert pdf.__repr__() == gdf.__repr__()
-    assert pdf._repr_html_() == gdf._repr_html_()
-    assert pdf._repr_latex_() == gdf._repr_latex_()
-
-
-@pytest.mark.parametrize("dtype", repr_categories)
-@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1])
-@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1])
-def test_full_dataframe_21(dtype, nrows, ncols):
-    size = 21
+def test_full_dataframe_20(dtype, size, nrows, ncols):
     pdf = pd.DataFrame(
         {idx: np.random.randint(0, 100, size) for idx in range(size)}
     ).astype(dtype)
     gdf = cudf.from_pandas(pdf)
 
-    pd.options.display.max_rows = int(nrows)
-    pd.options.display.max_columns = int(ncols)
-    assert pdf.__repr__() == gdf.__repr__()
-    pd.reset_option("display.max_rows")
-    pd.reset_option("display.max_columns")
+    with pd.option_context(
+        "display.max_rows", int(nrows), "display.max_columns", int(ncols)
+    ):
+        assert repr(pdf) == repr(gdf)
+        assert pdf._repr_html_() == gdf._repr_html_()
+        assert pdf._repr_latex_() == gdf._repr_latex_()
 
 
 @given(
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index b8f975f233e..2efa781c506 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
 import re
 
@@ -17,9 +17,9 @@
 )
 
 
-@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10])
-@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10])
-@pytest.mark.parametrize("num_rows", [1, 2, 1000])
+@pytest.mark.parametrize("num_id_vars", [0, 1, 2])
+@pytest.mark.parametrize("num_value_vars", [0, 1, 2])
+@pytest.mark.parametrize("num_rows", [1, 2, 100])
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
 @pytest.mark.parametrize("nulls", ["none", "some", "all"])
 def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index efe8e523d4e..56218372c23 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others):
 @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"])
 @pytest.mark.parametrize("na_rep", [None, "", "null", "a"])
 @pytest.mark.parametrize(
-    "index",
-    [
-        ["1", "2", "3", "4", "5"],
-        pd.Series(["1", "2", "3", "4", "5"]),
-        pd.Index(["1", "2", "3", "4", "5"]),
-    ],
+    "index", [["1", "2", "3", "4", "5"]],
 )
 def test_string_cat(ps_gs, others, sep, na_rep, index):
     ps, gs = ps_gs
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 56090c8eacf..faaea6eec08 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 import math
 import operator
 
@@ -14,7 +15,11 @@
     unary_ops,
 )
 from cudf.core.udf.utils import precompiled
-from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq
+from cudf.testing._utils import (
+    _decimal_series,
+    assert_eq,
+    parametrize_numeric_dtypes_pairwise,
+)
 
 
 def run_masked_udf_test(func, data, args=(), **kwargs):
@@ -238,10 +243,9 @@ def func(row):
     run_masked_udf_test(func, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES))
-@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES))
+@parametrize_numeric_dtypes_pairwise
 @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq])
-def test_apply_mixed_dtypes(dtype_a, dtype_b, op):
+def test_apply_mixed_dtypes(left_dtype, right_dtype, op):
     """
     Test that operations can be performed between columns
     of different dtypes and return a column with the correct
@@ -251,7 +255,7 @@ def test_apply_mixed_dtypes(dtype_a, dtype_b, op):
     # First perform the op on two dummy data on host, if numpy can
     # safely type cast, we should expect it to work in udf too.
     try:
-        op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42))
+        op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42))
     except TypeError:
         pytest.skip("Operation is unsupported for corresponding dtype.")
 
@@ -261,8 +265,8 @@ def func(row):
         return op(x, y)
 
     gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]})
-    gdf["a"] = gdf["a"].astype(dtype_a)
-    gdf["b"] = gdf["b"].astype(dtype_b)
+    gdf["a"] = gdf["a"].astype(left_dtype)
+    gdf["b"] = gdf["b"].astype(right_dtype)
 
     run_masked_udf_test(func, gdf, check_dtype=False)