From 8df33eed224bf33ad6013179459dfe41e5f26b2a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 9 Jan 2024 18:47:38 -0600
Subject: [PATCH] Expand JIT groupby test suite (#13813)

This PR reorganizes and expands the test suite for groupby apply functions using the JIT engine to include nan cases and cases where the groups are larger than a single thread block.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/13813
---
 python/cudf/cudf/tests/test_groupby.py | 371 ++++++++++++++++---------
 1 file changed, 238 insertions(+), 133 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 57aa6e72eae..b46949faa06 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import collections
 import datetime
@@ -392,19 +392,70 @@ def emulate(df):
 
 
 @pytest.fixture(scope="module")
-def groupby_jit_data():
-    np.random.seed(0)
+def groupby_jit_data_small():
+    """
+    Return a small dataset for testing JIT Groupby Apply. The dataframe
+    contains 4 groups of size 1, 2, 3, 4 as well as an additional key
+    column that can be used to test subgroups within groups. This data
+    is useful for smoke testing basic numeric results
+    """
+    rng = np.random.default_rng(42)
     df = DataFrame()
-    nelem = 20
-    df["key1"] = np.random.randint(0, 3, nelem)
-    df["key2"] = np.random.randint(0, 2, nelem)
-    df["val1"] = np.random.random(nelem)
-    df["val2"] = np.random.random(nelem)
-    df["val3"] = np.random.randint(0, 10, nelem)
-    df["val4"] = np.random.randint(0, 10, nelem)
+    key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4
+    key2 = [1, 2] * 5
+    df["key1"] = key1
+    df["key2"] = key2
+
+    df["val1"] = rng.integers(0, 10, len(key1))
+    df["val2"] = rng.integers(0, 10, len(key1))
+
+    # randomly permute data
+    df = df.sample(frac=1, ignore_index=True)
     return df
 
 
+@pytest.fixture(scope="module")
+def groupby_jit_data_large(groupby_jit_data_small):
+    """
+    Larger version of groupby_jit_data_small which contains enough data
+    to require more than one block per group. This data is useful for
+    testing if JIT GroupBy algorithms scale to larger dastasets without
+    manifesting numerical issues such as overflow.
+    """
+    max_tpb = 1024
+    factor = (
+        max_tpb + 1
+    )  # bigger than a block but not always an exact multiple
+    df = cudf.concat([groupby_jit_data_small] * factor)
+
+    return df
+
+
+@pytest.fixture(scope="module")
+def groupby_jit_data_nans(groupby_jit_data_small):
+    """
+    Returns a modified version of groupby_jit_data_small which contains
+    nan values.
+    """
+
+    df = groupby_jit_data_small.sort_values(["key1", "key2"])
+    df["val1"] = df["val1"].astype("float64")
+    df["val1"][::2] = np.nan
+    df = df.sample(frac=1, ignore_index=True)
+    return df
+
+
+@pytest.fixture(scope="module")
+def groupby_jit_datasets(
+    groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans
+):
+    return {
+        "small": groupby_jit_data_small,
+        "large": groupby_jit_data_large,
+        "nans": groupby_jit_data_nans,
+    }
+
+
 def run_groupby_apply_jit_test(data, func, keys, *args):
     expect_groupby_obj = data.to_pandas().groupby(keys)
     got_groupby_obj = data.groupby(keys)
@@ -415,6 +466,30 @@ def run_groupby_apply_jit_test(data, func, keys, *args):
     assert_groupby_results_equal(cudf_jit_result, pandas_result)
 
 
+def groupby_apply_jit_reductions_test_inner(func, data, dtype):
+    # ideally we'd just have:
+    # lambda group: getattr(group, func)()
+    # but the current kernel caching mechanism relies on pickle which
+    # does not play nice with local functions. What's below uses
+    # exec as a workaround to write the test functions dynamically
+
+    funcstr = textwrap.dedent(
+        f"""
+        def func(df):
+            return df['val1'].{func}()
+        """
+    )
+    lcl = {}
+    exec(funcstr, lcl)
+    func = lcl["func"]
+
+    data["val1"] = data["val1"].astype(dtype)
+    data["val2"] = data["val2"].astype(dtype)
+
+    run_groupby_apply_jit_test(data, func, ["key1"])
+
+
+# test unary reductions
 @pytest.mark.parametrize(
     "dtype",
     SUPPORTED_GROUPBY_NUMPY_TYPES,
@@ -423,13 +498,41 @@ def run_groupby_apply_jit_test(data, func, keys, *args):
 @pytest.mark.parametrize(
     "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"]
 )
-def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype):
-    # ideally we'd just have:
-    # lambda group: getattr(group, func)()
-    # but the current kernel caching mechanism relies on pickle which
-    # does not play nice with local functions. What's below uses
-    # exec as a workaround to write the test functions dynamically
+@pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+def test_groupby_apply_jit_unary_reductions(
+    func, dtype, dataset, groupby_jit_datasets
+):
+    dataset = groupby_jit_datasets[dataset]
+
+    groupby_apply_jit_reductions_test_inner(func, dataset, dtype)
+
+
+# test unary reductions for special values
+def groupby_apply_jit_reductions_special_vals_inner(
+    func, data, dtype, special_val
+):
+    funcstr = textwrap.dedent(
+        f"""
+        def func(df):
+            return df['val1'].{func}()
+        """
+    )
+    lcl = {}
+    exec(funcstr, lcl)
+    func = lcl["func"]
 
+    data["val1"] = data["val1"].astype(dtype)
+    data["val2"] = data["val2"].astype(dtype)
+    data["val1"] = special_val
+    data["val2"] = special_val
+
+    run_groupby_apply_jit_test(data, func, ["key1"])
+
+
+# test unary index reductions for special values
+def groupby_apply_jit_idx_reductions_special_vals_inner(
+    func, data, dtype, special_val
+):
     funcstr = textwrap.dedent(
         f"""
         def func(df):
@@ -440,36 +543,129 @@ def func(df):
     exec(funcstr, lcl)
     func = lcl["func"]
 
-    groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype)
-    groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype)
+    data["val1"] = data["val1"].astype(dtype)
+    data["val2"] = data["val2"].astype(dtype)
+    data["val1"] = special_val
+    data["val2"] = special_val
 
-    run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"])
+    run_groupby_apply_jit_test(data, func, ["key1"])
 
 
-@pytest.mark.parametrize("dtype", SUPPORTED_GROUPBY_NUMPY_TYPES)
-def test_groupby_apply_jit_correlation(groupby_jit_data, dtype):
+@pytest.mark.parametrize("dtype", ["float64", "float32"])
+@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"])
+@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf])
+@pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+def test_groupby_apply_jit_reductions_special_vals(
+    func, dtype, dataset, groupby_jit_datasets, special_val
+):
+    dataset = groupby_jit_datasets[dataset]
+    groupby_apply_jit_reductions_special_vals_inner(
+        func, dataset, dtype, special_val
+    )
 
-    groupby_jit_data["val3"] = groupby_jit_data["val3"].astype(dtype)
-    groupby_jit_data["val4"] = groupby_jit_data["val4"].astype(dtype)
 
-    keys = ["key1", "key2"]
+@pytest.mark.parametrize("dtype", ["float64"])
+@pytest.mark.parametrize("func", ["idxmax", "idxmin"])
+@pytest.mark.parametrize(
+    "special_val",
+    [
+        pytest.param(
+            np.nan,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/13832"
+            ),
+        ),
+        np.inf,
+        -np.inf,
+    ],
+)
+@pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+def test_groupby_apply_jit_idx_reductions_special_vals(
+    func, dtype, dataset, groupby_jit_datasets, special_val
+):
+    dataset = groupby_jit_datasets[dataset]
+    groupby_apply_jit_idx_reductions_special_vals_inner(
+        func, dataset, dtype, special_val
+    )
+
+
+@pytest.mark.parametrize("dtype", ["int32"])
+def test_groupby_apply_jit_sum_integer_overflow(dtype):
+    max = np.iinfo(dtype).max
+
+    data = DataFrame(
+        {
+            "a": [0, 0, 0],
+            "b": [max, max, max],
+        }
+    )
 
     def func(group):
-        return group["val3"].corr(group["val4"])
+        return group["b"].sum()
 
-    if dtype.kind == "f":
+    run_groupby_apply_jit_test(data, func, ["a"])
+
+
+@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
+@pytest.mark.parametrize(
+    "dataset",
+    [
+        pytest.param(
+            "small",
+            marks=[
+                pytest.mark.filterwarnings(
+                    "ignore:Degrees of Freedom <= 0 for slice"
+                ),
+                pytest.mark.filterwarnings(
+                    "ignore:divide by zero encountered in divide"
+                ),
+            ],
+        ),
+        "large",
+    ],
+)
+def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype):
+
+    dataset = groupby_jit_datasets[dataset]
+
+    dataset["val1"] = dataset["val1"].astype(dtype)
+    dataset["val2"] = dataset["val2"].astype(dtype)
+
+    keys = ["key1"]
+
+    def func(group):
+        return group["val1"].corr(group["val2"])
+
+    if np.dtype(dtype).kind == "f":
+        # Correlation of floating types is not yet supported:
+        # https://github.com/rapidsai/cudf/issues/13839
         m = (
             f"Series.corr\\(Series\\) is not "
             f"supported for \\({dtype}, {dtype}\\)"
         )
         with pytest.raises(UDFError, match=m):
-            run_groupby_apply_jit_test(groupby_jit_data, func, keys)
+            run_groupby_apply_jit_test(dataset, func, keys)
         return
-    run_groupby_apply_jit_test(groupby_jit_data, func, keys)
+    run_groupby_apply_jit_test(dataset, func, keys)
+
+
+@pytest.mark.parametrize("dtype", ["int32", "int64"])
+def test_groupby_apply_jit_correlation_zero_variance(dtype):
+    # pearson correlation is undefined when the variance of either
+    # variable is zero. This test ensures that the jit implementation
+    # returns the same result as pandas in this case.
+    data = DataFrame(
+        {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]}
+    )
+
+    def func(group):
+        return group["b"].corr(group["c"])
+
+    run_groupby_apply_jit_test(data, func, ["a"])
 
 
 @pytest.mark.parametrize("op", unary_ops)
-def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data, op):
+def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op):
     keys = ["key1"]
 
     def func(group):
@@ -479,11 +675,13 @@ def func(group):
         UDFError,
         match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy",
     ):
-        run_groupby_apply_jit_test(groupby_jit_data, func, keys)
+        run_groupby_apply_jit_test(groupby_jit_data_small, func, keys)
 
 
 @pytest.mark.parametrize("op", arith_ops + comparison_ops)
-def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data, op):
+def test_groupby_apply_jit_invalid_binary_ops_error(
+    groupby_jit_data_small, op
+):
     keys = ["key1"]
 
     def func(group):
@@ -493,10 +691,10 @@ def func(group):
         UDFError,
         match=f"{op.__name__}\\(Series, Series\\) is not supported",
     ):
-        run_groupby_apply_jit_test(groupby_jit_data, func, keys)
+        run_groupby_apply_jit_test(groupby_jit_data_small, func, keys)
 
 
-def test_groupby_apply_jit_no_df_ops(groupby_jit_data):
+def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small):
     # DataFrame level operations are not yet supported.
     def func(group):
         return group.sum()
@@ -505,7 +703,7 @@ def func(group):
         UDFError,
         match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)",
     ):
-        run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"])
+        run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"])
 
 
 @pytest.mark.parametrize("dtype", ["uint8", "str"])
@@ -529,101 +727,6 @@ def func(group):
         run_groupby_apply_jit_test(df, func, ["a"])
 
 
-@pytest.mark.parametrize("dtype", ["int32", "int64"])
-def test_groupby_apply_jit_correlation_zero_variance(dtype):
-    # pearson correlation is undefined when the variance of either
-    # variable is zero. This test ensures that the jit implementation
-    # returns the same result as pandas in this case.
-    data = DataFrame(
-        {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]}
-    )
-
-    def func(group):
-        return group["b"].corr(group["c"])
-
-    run_groupby_apply_jit_test(data, func, ["a"])
-
-
-@pytest.mark.parametrize("dtype", ["int32"])
-def test_groupby_apply_jit_sum_integer_overflow(dtype):
-    max = np.iinfo(dtype).max
-
-    data = DataFrame(
-        {
-            "a": [0, 0, 0],
-            "b": [max, max, max],
-        }
-    )
-
-    def func(group):
-        return group["b"].sum()
-
-    run_groupby_apply_jit_test(data, func, ["a"])
-
-
-@pytest.mark.parametrize("dtype", ["float64"])
-@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"])
-@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf])
-def test_groupby_apply_jit_reductions_special_vals(
-    func, groupby_jit_data, dtype, special_val
-):
-    # dynamically generate to avoid pickling error.
-    # see test_groupby_apply_jit_reductions for details.
-    funcstr = textwrap.dedent(
-        f"""
-            def func(df):
-                return df['val1'].{func}()
-        """
-    )
-    lcl = {}
-    exec(funcstr, lcl)
-    func = lcl["func"]
-
-    groupby_jit_data["val1"] = special_val
-    groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype)
-
-    run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"])
-
-
-@pytest.mark.parametrize("dtype", ["float64"])
-@pytest.mark.parametrize("func", ["idxmax", "idxmin"])
-@pytest.mark.parametrize(
-    "special_val",
-    [
-        pytest.param(
-            np.nan,
-            marks=pytest.mark.xfail(
-                reason="https://github.com/rapidsai/cudf/issues/13832"
-            ),
-        ),
-        np.inf,
-        -np.inf,
-    ],
-)
-def test_groupby_apply_jit_idx_reductions_special_vals(
-    func, groupby_jit_data, dtype, special_val
-):
-    # dynamically generate to avoid pickling error.
-    # see test_groupby_apply_jit_reductions for details.
-    funcstr = textwrap.dedent(
-        f"""
-            def func(df):
-                return df['val1'].{func}()
-        """
-    )
-    lcl = {}
-    exec(funcstr, lcl)
-    func = lcl["func"]
-
-    groupby_jit_data["val1"] = special_val
-    groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype)
-
-    expect = groupby_jit_data.to_pandas().groupby("key1").apply(func)
-    got = groupby_jit_data.groupby("key1").apply(func, engine="jit")
-
-    assert_eq(expect, got, check_dtype=False)
-
-
 @pytest.mark.parametrize(
     "func",
     [
@@ -632,8 +735,8 @@ def func(df):
         lambda df: df["val1"].mean() + df["val2"].std(),
     ],
 )
-def test_groupby_apply_jit_basic(func, groupby_jit_data):
-    run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"])
+def test_groupby_apply_jit_basic(func, groupby_jit_data_small):
+    run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"])
 
 
 def create_test_groupby_apply_jit_args_params():
@@ -652,8 +755,10 @@ def f3(df, k, L, m):
 @pytest.mark.parametrize(
     "func,args", create_test_groupby_apply_jit_args_params()
 )
-def test_groupby_apply_jit_args(func, args, groupby_jit_data):
-    run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args)
+def test_groupby_apply_jit_args(func, args, groupby_jit_data_small):
+    run_groupby_apply_jit_test(
+        groupby_jit_data_small, func, ["key1", "key2"], *args
+    )
 
 
 def test_groupby_apply_jit_block_divergence():