diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index ebf8c677e55..f3270b93406 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -150,7 +150,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): ngroups = len(offsets) - 1 cache_key = _generate_cache_key( - grouped_values, function, suffix="__GROUPBY_APPLY_UDF" + grouped_values, function, args, suffix="__GROUPBY_APPLY_UDF" ) if cache_key not in precompiled: diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index ed0c3332499..b3b846658f6 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -24,6 +24,7 @@ column_from_udf_string_array, column_to_string_view_array, ) +from cudf.api.types import is_scalar from cudf.core.column.column import as_column from cudf.core.dtypes import dtype from cudf.core.udf.masked_typing import MaskedType @@ -245,7 +246,7 @@ def _mask_get(mask, pos): return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 -def _generate_cache_key(frame, func: Callable, suffix="__APPLY_UDF"): +def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): """Create a cache key that uniquely identifies a compilation. A new compilation is needed any time any of the following things change: @@ -253,12 +254,14 @@ def _generate_cache_key(frame, func: Callable, suffix="__APPLY_UDF"): - The types of the columns utilized by the UDF - The existence of the input columns masks """ + scalar_argtypes = tuple(typeof(arg) for arg in args) return ( *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), *(col.mask is None for col in frame._data.values()), *frame._data.keys(), + scalar_argtypes, suffix, ) @@ -285,9 +288,11 @@ def _compile_or_get(frame, func, args, kernel_getter=None): we then obtain the return type from that separate compilation and use it to allocate an output column of the right dtype. """ + if not all(is_scalar(arg) for arg in args): + raise TypeError("only scalar valued args are supported by apply") # check to see if we already compiled this function - cache_key = _generate_cache_key(frame, func) + cache_key = _generate_cache_key(frame, func, args) if precompiled.get(cache_key) is not None: kernel, masked_or_scalar = precompiled[cache_key] return kernel, masked_or_scalar diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index ab0205df677..515a9fd5956 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -775,6 +775,19 @@ def f(x): assert precompiled.currsize == 1 + # validate that changing the type of a scalar arg + # results in a miss + precompiled.clear() + + def f(x, c): + return x + c + + data.apply(f, args=(1,)) + assert precompiled.currsize == 1 + + data.apply(f, args=(1.5,)) + assert precompiled.currsize == 2 + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]]