From 851e23545d29bca17a778f84dbc2000b3dde0ba8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:31:45 -0600 Subject: [PATCH] Reduce pytest runtime (#10203) This PR reduces the overall runtime of the cuDF pytest suite. Changes include: - asserting equal on the GPU where possible for large datasets - in some cases reducing excessive test data size part of https://github.com/rapidsai/cudf/issues/9999 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10203 --- python/cudf/cudf/testing/_utils.py | 9 +++- .../test_avro_reader_fastavro_integration.py | 4 +- python/cudf/cudf/tests/test_binops.py | 15 +++--- python/cudf/cudf/tests/test_csv.py | 11 +++-- .../cudf/tests/test_extension_compilation.py | 12 +++-- python/cudf/cudf/tests/test_indexing.py | 46 +++++++++---------- python/cudf/cudf/tests/test_orc.py | 24 +++++----- python/cudf/cudf/tests/test_parquet.py | 8 ++-- python/cudf/cudf/tests/test_repr.py | 45 ++++++++---------- python/cudf/cudf/tests/test_reshape.py | 8 ++-- python/cudf/cudf/tests/test_string.py | 7 +-- python/cudf/cudf/tests/test_udf_masked_ops.py | 18 +++++--- 12 files changed, 106 insertions(+), 101 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index b97b2d660d6..e767c0c62be 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. +import itertools import re import warnings from collections.abc import Mapping, Sequence @@ -330,3 +331,9 @@ def does_not_raise(): def xfail_param(param, **kwargs): return pytest.param(param, marks=pytest.mark.xfail(**kwargs)) + + +parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( + "left_dtype,right_dtype", + list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), +) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a064bec9e82..9eb01ae31b4 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -210,7 +210,7 @@ def test_can_parse_no_schema(): assert_eq(expected, actual) -@pytest.mark.parametrize("rows", [0, 1, 10, 100000]) +@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) def test_avro_compression(rows, codec): schema = { diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 76add8b9c5d..02ca7a0cd58 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -4,7 +4,7 @@ import decimal import operator import random -from itertools import product +from itertools import combinations_with_replacement, product import cupy as cp import numpy as np @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype): def _series_compare_nulls_typegen(): - tests = [] - tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) - tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) - tests += list(product(STRING_TYPES, STRING_TYPES)) - - return tests + return [ + *combinations_with_replacement(DATETIME_TYPES, 2), + *combinations_with_replacement(TIMEDELTA_TYPES, 2), + *combinations_with_replacement(NUMERIC_TYPES, 2), + *combinations_with_replacement(STRING_TYPES, 2), + ] @pytest.mark.parametrize("cmpop", _cmpops) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9208b8c7cd4..f3d69e1745e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import gzip import os @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from pathlib import Path +import cupy as cp import numpy as np import pandas as pd import pytest @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir): def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" for row in range(rows): buffer += str(row) + ", " + str(2 * row) + "\r\n" df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) assert len(df) == rows - for row in range(0, rows): - assert df[names[0]][row] == row - assert df[names[1]][row] == 2 * row + assert_eq(expect, df) def test_csv_reader_tabs(): diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 47c9448cf63..692f40873d7 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,13 +1,17 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import operator import cupy as cp +import numpy as np import pytest from numba import cuda, types from numba.cuda import compile_ptx +from numba.np.numpy_support import from_dtype from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType +from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( operator.add, @@ -159,19 +163,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@pytest.mark.parametrize("ty1", number_types, ids=number_ids) -@pytest.mark.parametrize("ty2", number_types, ids=number_ids) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, ty1, ty2, masked): +def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): def func(x, y): return op(x, y) cc = (7, 5) + ty1 = from_dtype(np.dtype(left_dtype)) + ty2 = from_dtype(np.dtype(right_dtype)) + if masked[0]: ty1 = MaskedType(ty1) if masked[1]: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 102e5b57e8e..19d7c8a10ab 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from itertools import combinations @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe): @pytest.mark.parametrize( - "gdf", + "gdf_kwargs", [ - cudf.DataFrame({"a": range(1000000)}), - cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { + {"data": {"a": range(100000)}}, + {"data": {"a": range(100000), "b": range(100000)}}, + { + "data": { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(1000000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(1000000)), - cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(1000000) - ), - cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(1000000)).astype("str"), - ), + }, + {"index": [1, 2, 3]}, + {"index": range(100000)}, + {"columns": ["a", "b", "c", "d"]}, + {"columns": ["a"], "index": range(100000)}, + {"columns": ["a", "col2", "...col n"], "index": range(100000)}, + {"index": cudf.Series(range(100000)).astype("str")}, + { + "columns": ["a", "b", "c", "d"], + "index": cudf.Series(range(100000)).astype("str"), + }, ], ) @pytest.mark.parametrize( "slice", [ - slice(250000, 500000), - slice(250000, 250001), - slice(500000), + slice(25000, 50000), + slice(25000, 25001), + slice(50000), slice(1, 10), slice(10, 20), slice(15, 24000), slice(6), ], ) -def test_dataframe_sliced(gdf, slice): +def test_dataframe_sliced(gdf_kwargs, slice): + gdf = cudf.DataFrame(**gdf_kwargs) pdf = gdf.to_pandas() actual = gdf[slice] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8689f773a02..623098741a9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -16,6 +16,7 @@ import cudf from cudf.io.orc import ORCWriter +from cudf.testing import assert_frame_equal from cudf.testing._utils import ( assert_eq, gen_rand_series, @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path, engine=engine, columns=columns, use_index=use_index ) - assert_eq(expect, got, check_categorical=False) + assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) def test_orc_reader_filenotfound(tmpdir): @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): else: print(type(excpr).__name__) - expect = orcfile.read(columns=columns).to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression) - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect.to_orc(gdf_fname.strpath, compression=compression) + got = cudf.from_pandas( + pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + ) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): else: print(type(excpr).__name__) - expect = orcfile.read().to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect.to_orc(gdf_fname.strpath, statistics=stats_freq) + got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -492,8 +495,7 @@ def test_chunked_orc_writer( writer.close() got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - - assert_eq(expect, got) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e1ca9f6f006..7feaa400446 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): - num_rows = 128 + num_rows = 10 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 120)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): - num_rows = 128 + num_rows = 20 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ca02ee55df0..8f2e4811e36 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import textwrap @@ -13,7 +13,14 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +repr_categories = [ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", +] @pytest.mark.parametrize("dtype", repr_categories) @@ -84,36 +91,22 @@ def test_full_series(nrows, dtype): pd.reset_option("display.max_rows") +@pytest.mark.parametrize("nrows", [5, 10, 15]) +@pytest.mark.parametrize("ncols", [5, 10, 15]) +@pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -def test_full_dataframe_20(dtype, nrows, ncols): - size = 20 - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - assert pdf.__repr__() == gdf.__repr__() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1]) -@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1]) -def test_full_dataframe_21(dtype, nrows, ncols): - size = 21 +def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", int(nrows), "display.max_columns", int(ncols) + ): + assert repr(pdf) == repr(gdf) + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == gdf._repr_latex_() @given( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b8f975f233e..2efa781c506 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import re @@ -17,9 +17,9 @@ ) -@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) +@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_rows", [1, 2, 100]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index efe8e523d4e..56218372c23 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) @pytest.mark.parametrize( - "index", - [ - ["1", "2", "3", "4", "5"], - pd.Series(["1", "2", "3", "4", "5"]), - pd.Index(["1", "2", "3", "4", "5"]), - ], + "index", [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 56090c8eacf..faaea6eec08 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import math import operator @@ -14,7 +15,11 @@ unary_ops, ) from cudf.core.udf.utils import precompiled -from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq +from cudf.testing._utils import ( + _decimal_series, + assert_eq, + parametrize_numeric_dtypes_pairwise, +) def run_masked_udf_test(func, data, args=(), **kwargs): @@ -238,10 +243,9 @@ def func(row): run_masked_udf_test(func, gdf, check_dtype=False) -@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) -@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(dtype_a, dtype_b, op): +def test_apply_mixed_dtypes(left_dtype, right_dtype, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct @@ -251,7 +255,7 @@ def test_apply_mixed_dtypes(dtype_a, dtype_b, op): # First perform the op on two dummy data on host, if numpy can # safely type cast, we should expect it to work in udf too. try: - op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) except TypeError: pytest.skip("Operation is unsupported for corresponding dtype.") @@ -261,8 +265,8 @@ def func(row): return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(dtype_a) - gdf["b"] = gdf["b"].astype(dtype_b) + gdf["a"] = gdf["a"].astype(left_dtype) + gdf["b"] = gdf["b"].astype(right_dtype) run_masked_udf_test(func, gdf, check_dtype=False)