diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 784cabaa542..16dd2f563c9 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -464,6 +464,23 @@ def func(group):
     run_groupby_apply_jit_test(data, func, ["a"])
 
 
+@pytest.mark.parametrize("dtype", ["int32"])
+def test_groupby_apply_jit_sum_integer_overflow(dtype):
+    max = np.iinfo(dtype).max
+
+    data = DataFrame(
+        {
+            "a": [0, 0, 0],
+            "b": [max, max, max],
+        }
+    )
+
+    def func(group):
+        return group["b"].sum()
+
+    run_groupby_apply_jit_test(data, func, ["a"])
+
+
 @pytest.mark.parametrize("dtype", ["float64"])
 @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"])
 @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf])
diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu
index 686e39e7036..cabca3154be 100644
--- a/python/cudf/udf_cpp/shim.cu
+++ b/python/cudf/udf_cpp/shim.cu
@@ -388,26 +388,30 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block,
   return count == 0;
 }
 
-template <typename T>
-__device__ void device_sum(cooperative_groups::thread_block const& block,
-                           T const* data,
-                           int64_t size,
-                           T* sum)
+template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
+__device__ AccumT device_sum(cooperative_groups::thread_block const& block,
+                             T const* data,
+                             int64_t size)
 {
-  T local_sum = 0;
+  __shared__ AccumT block_sum;
+  if (block.thread_rank() == 0) { block_sum = 0; }
+  block.sync();
+
+  AccumT local_sum = 0;
 
   for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) {
-    local_sum += data[idx];
+    local_sum += static_cast<AccumT>(data[idx]);
  }
 
-  cuda::atomic_ref<T, cuda::thread_scope_block> ref{*sum};
+  cuda::atomic_ref<AccumT, cuda::thread_scope_block> ref{block_sum};
   ref.fetch_add(local_sum, cuda::std::memory_order_relaxed);
 
   block.sync();
+
+  return block_sum;
 }
 
-template <typename T>
-__device__ T BlockSum(T const* data, int64_t size)
+template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
+__device__ AccumT BlockSum(T const* data, int64_t size)
 {
   auto block = cooperative_groups::this_thread_block();
 
@@ -415,11 +419,7 @@ __device__ T BlockSum(T const* data, int64_t size)
     if (are_all_nans(block, data, size)) { return 0; }
   }
 
-  __shared__ T block_sum;
-  if (block.thread_rank() == 0) { block_sum = 0; }
-  block.sync();
-
-  device_sum(block, data, size, &block_sum);
+  auto block_sum = device_sum(block, data, size);
   return block_sum;
 }
 
@@ -428,11 +428,7 @@ __device__ double BlockMean(T const* data, int64_t size)
 {
   auto block = cooperative_groups::this_thread_block();
 
-  __shared__ T block_sum;
-  if (block.thread_rank() == 0) { block_sum = 0; }
-  block.sync();
-
-  device_sum(block, data, size, &block_sum);
+  auto block_sum = device_sum(block, data, size);
   return static_cast<double>(block_sum) / static_cast<double>(size);
 }
 
@@ -443,17 +439,11 @@ __device__ double BlockCoVar(T const* lhs, T const* rhs, int64_t size)
 
   __shared__ double block_covar;
 
-  __shared__ T block_sum_lhs;
-  __shared__ T block_sum_rhs;
-
-  if (block.thread_rank() == 0) {
-    block_covar   = 0;
-    block_sum_lhs = 0;
-    block_sum_rhs = 0;
-  }
+  if (block.thread_rank() == 0) { block_covar = 0; }
   block.sync();
 
-  device_sum(block, lhs, size, &block_sum_lhs);
+  auto block_sum_lhs = device_sum(block, lhs, size);
+
   auto const mu_l = static_cast<double>(block_sum_lhs) / static_cast<double>(size);
   auto const mu_r = [=]() {
     if (lhs == rhs) {
       // If the lhs and rhs are the same, this is calculating variance.
       // Thus we can assume mu_r = mu_l.
       return mu_l;
     } else {
-      device_sum(block, rhs, size, &block_sum_rhs);
+      auto block_sum_rhs = device_sum(block, rhs, size);
       return static_cast<double>(block_sum_rhs) / static_cast<double>(size);
     }
   }();
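
For context, the core of the fix is that `device_sum` no longer accumulates into a `T`-typed shared variable; it accumulates into `AccumT`, which defaults to `int64_t` whenever `T` is integral, so summing three `INT32_MAX` values (the case the new test exercises) no longer wraps. The host-side sketch below is not cudf code and is not part of the patch; it only illustrates the `std::conditional_t`-based accumulator selection, assuming that widening every integral element type to `int64_t` is the intended behavior. The helper name `widened_sum` is made up for this example.

```cpp
// Host-side illustration (not the cudf shim) of the accumulator-widening idea:
// integral element types accumulate into int64_t, floating-point types keep their own type.
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>
#include <vector>

template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
AccumT widened_sum(std::vector<T> const& data)
{
  AccumT acc = 0;
  for (auto const& v : data) {
    acc += static_cast<AccumT>(v);  // widen each element before adding
  }
  return acc;
}

int main()
{
  auto const max = std::numeric_limits<int32_t>::max();
  std::vector<int32_t> b{max, max, max};  // mirrors column "b" in the new test

  // 3 * INT32_MAX = 6442450941, which overflows int32_t but is exact in int64_t.
  std::cout << widened_sum(b) << "\n";
  return 0;
}
```

The patched `BlockSum`, `BlockMean`, and `BlockCoVar` rely on the same default accumulator type through `device_sum`; callers that previously passed a `T*` output pointer now receive the widened block sum as the return value instead.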