From 8df33eed224bf33ad6013179459dfe41e5f26b2a Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 9 Jan 2024 18:47:38 -0600 Subject: [PATCH] Expand JIT groupby test suite (#13813) This PR reorganizes and expands the test suite for groupby apply functions using the JIT engine to include nan cases and cases where the groups are larger than a single thread block. Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13813 --- python/cudf/cudf/tests/test_groupby.py | 371 ++++++++++++++++--------- 1 file changed, 238 insertions(+), 133 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 57aa6e72eae..b46949faa06 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import collections import datetime @@ -392,19 +392,70 @@ def emulate(df): @pytest.fixture(scope="module") -def groupby_jit_data(): - np.random.seed(0) +def groupby_jit_data_small(): + """ + Return a small dataset for testing JIT Groupby Apply. The dataframe + contains 4 groups of size 1, 2, 3, 4 as well as an additional key + column that can be used to test subgroups within groups. 
This data + is useful for smoke testing basic numeric results. + """ + rng = np.random.default_rng(42) df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - df["val3"] = np.random.randint(0, 10, nelem) - df["val4"] = np.random.randint(0, 10, nelem) + key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 + key2 = [1, 2] * 5 + df["key1"] = key1 + df["key2"] = key2 + + df["val1"] = rng.integers(0, 10, len(key1)) + df["val2"] = rng.integers(0, 10, len(key1)) + + # randomly permute data + df = df.sample(frac=1, ignore_index=True) return df +@pytest.fixture(scope="module") +def groupby_jit_data_large(groupby_jit_data_small): + """ + Larger version of groupby_jit_data_small which contains enough data + to require more than one block per group. This data is useful for + testing if JIT GroupBy algorithms scale to larger datasets without + manifesting numerical issues such as overflow. + """ + max_tpb = 1024 + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple + df = cudf.concat([groupby_jit_data_small] * factor) + + return df + + +@pytest.fixture(scope="module") +def groupby_jit_data_nans(groupby_jit_data_small): + """ + Returns a modified version of groupby_jit_data_small which contains + nan values. 
+ """ + + df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"] = df["val1"].astype("float64") + df["val1"][::2] = np.nan + df = df.sample(frac=1, ignore_index=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_datasets( + groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans +): + return { + "small": groupby_jit_data_small, + "large": groupby_jit_data_large, + "nans": groupby_jit_data_nans, + } + + def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys) got_groupby_obj = data.groupby(keys) @@ -415,6 +466,30 @@ def run_groupby_apply_jit_test(data, func, keys, *args): assert_groupby_results_equal(cudf_jit_result, pandas_result) +def groupby_apply_jit_reductions_test_inner(func, data, dtype): + # ideally we'd just have: + # lambda group: getattr(group, func)() + # but the current kernel caching mechanism relies on pickle which + # does not play nice with local functions. What's below uses + # exec as a workaround to write the test functions dynamically + + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test unary reductions @pytest.mark.parametrize( "dtype", SUPPORTED_GROUPBY_NUMPY_TYPES, @@ -423,13 +498,41 @@ def run_groupby_apply_jit_test(data, func, keys, *args): @pytest.mark.parametrize( "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) -def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): - # ideally we'd just have: - # lambda group: getattr(group, func)() - # but the current kernel caching mechanism relies on pickle which - # does not play nice with local functions. 
What's below uses - # exec as a workaround to write the test functions dynamically +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_unary_reductions( + func, dtype, dataset, groupby_jit_datasets +): + dataset = groupby_jit_datasets[dataset] + + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + +# test unary reductions for special values +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val +): + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test unary index reductions for special values +def groupby_apply_jit_idx_reductions_special_vals_inner( + func, data, dtype, special_val +): funcstr = textwrap.dedent( f""" def func(df): @@ -440,36 +543,129 @@ def func(df): exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", SUPPORTED_GROUPBY_NUMPY_TYPES) -def test_groupby_apply_jit_correlation(groupby_jit_data, dtype): +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_reductions_special_vals( + func, dtype, dataset, 
groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) - groupby_jit_data["val3"] = groupby_jit_data["val3"].astype(dtype) - groupby_jit_data["val4"] = groupby_jit_data["val4"].astype(dtype) - keys = ["key1", "key2"] +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) +@pytest.mark.parametrize( + "special_val", + [ + pytest.param( + np.nan, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13832" + ), + ), + np.inf, + -np.inf, + ], +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_idx_reductions_special_vals( + func, dtype, dataset, groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_idx_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) + + +@pytest.mark.parametrize("dtype", ["int32"]) +def test_groupby_apply_jit_sum_integer_overflow(dtype): + max = np.iinfo(dtype).max + + data = DataFrame( + { + "a": [0, 0, 0], + "b": [max, max, max], + } + ) def func(group): - return group["val3"].corr(group["val4"]) + return group["b"].sum() - if dtype.kind == "f": + run_groupby_apply_jit_test(data, func, ["a"]) + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize( + "dataset", + [ + pytest.param( + "small", + marks=[ + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero encountered in divide" + ), + ], + ), + "large", + ], +) +def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): + + dataset = groupby_jit_datasets[dataset] + + dataset["val1"] = dataset["val1"].astype(dtype) + dataset["val2"] = dataset["val2"].astype(dtype) + + keys = ["key1"] + + def func(group): + return group["val1"].corr(group["val2"]) + + if 
np.dtype(dtype).kind == "f": + # Correlation of floating types is not yet supported: + # https://github.com/rapidsai/cudf/issues/13839 m = ( f"Series.corr\\(Series\\) is not " f"supported for \\({dtype}, {dtype}\\)" ) with pytest.raises(UDFError, match=m): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(dataset, func, keys) return - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(dataset, func, keys) + + +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +def test_groupby_apply_jit_correlation_zero_variance(dtype): + # pearson correlation is undefined when the variance of either + # variable is zero. This test ensures that the jit implementation + # returns the same result as pandas in this case. + data = DataFrame( + {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} + ) + + def func(group): + return group["b"].corr(group["c"]) + + run_groupby_apply_jit_test(data, func, ["a"]) @pytest.mark.parametrize("op", unary_ops) -def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data, op): +def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op): keys = ["key1"] def func(group): @@ -479,11 +675,13 @@ def func(group): UDFError, match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy", ): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data, op): +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): keys = ["key1"] def func(group): @@ -493,10 +691,10 @@ def func(group): UDFError, match=f"{op.__name__}\\(Series, Series\\) is not supported", ): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) -def 
test_groupby_apply_jit_no_df_ops(groupby_jit_data): +def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small): # DataFrame level operations are not yet supported. def func(group): return group.sum() @@ -505,7 +703,7 @@ def func(group): UDFError, match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)", ): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"]) @pytest.mark.parametrize("dtype", ["uint8", "str"]) @@ -529,101 +727,6 @@ def func(group): run_groupby_apply_jit_test(df, func, ["a"]) -@pytest.mark.parametrize("dtype", ["int32", "int64"]) -def test_groupby_apply_jit_correlation_zero_variance(dtype): - # pearson correlation is undefined when the variance of either - # variable is zero. This test ensures that the jit implementation - # returns the same result as pandas in this case. - data = DataFrame( - {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} - ) - - def func(group): - return group["b"].corr(group["c"]) - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["int32"]) -def test_groupby_apply_jit_sum_integer_overflow(dtype): - max = np.iinfo(dtype).max - - data = DataFrame( - { - "a": [0, 0, 0], - "b": [max, max, max], - } - ) - - def func(group): - return group["b"].sum() - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -def test_groupby_apply_jit_reductions_special_vals( - func, groupby_jit_data, dtype, special_val -): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. 
- funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@pytest.mark.parametrize( - "special_val", - [ - pytest.param( - np.nan, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13832" - ), - ), - np.inf, - -np.inf, - ], -) -def test_groupby_apply_jit_idx_reductions_special_vals( - func, groupby_jit_data, dtype, special_val -): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - - expect = groupby_jit_data.to_pandas().groupby("key1").apply(func) - got = groupby_jit_data.groupby("key1").apply(func, engine="jit") - - assert_eq(expect, got, check_dtype=False) - - @pytest.mark.parametrize( "func", [ @@ -632,8 +735,8 @@ def func(df): lambda df: df["val1"].mean() + df["val2"].std(), ], ) -def test_groupby_apply_jit_basic(func, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) +def test_groupby_apply_jit_basic(func, groupby_jit_data_small): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) def create_test_groupby_apply_jit_args_params(): @@ -652,8 +755,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) -def test_groupby_apply_jit_args(func, args, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def 
test_groupby_apply_jit_args(func, args, groupby_jit_data_small): + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) def test_groupby_apply_jit_block_divergence():