diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 784cabaa542..16dd2f563c9 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -464,6 +464,23 @@ def func(group):
     run_groupby_apply_jit_test(data, func, ["a"])
 
 
+@pytest.mark.parametrize("dtype", ["int32"])
+def test_groupby_apply_jit_sum_integer_overflow(dtype):
+    max = np.iinfo(dtype).max
+
+    data = DataFrame(
+        {
+            "a": [0, 0, 0],
+            "b": [max, max, max],
+        }
+    )
+
+    def func(group):
+        return group["b"].sum()
+
+    run_groupby_apply_jit_test(data, func, ["a"])
+
+
 @pytest.mark.parametrize("dtype", ["float64"])
 @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"])
 @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf])
diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu
index 686e39e7036..cabca3154be 100644
--- a/python/cudf/udf_cpp/shim.cu
+++ b/python/cudf/udf_cpp/shim.cu
@@ -388,26 +388,30 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block,
   return count == 0;
 }
 
-template <typename T>
-__device__ void device_sum(cooperative_groups::thread_block const& block,
-                           T const* data,
-                           int64_t size,
-                           T* sum)
+template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
+__device__ AccumT device_sum(cooperative_groups::thread_block const& block,
+                             T const* data,
+                             int64_t size)
 {
-  T local_sum = 0;
+  __shared__ AccumT block_sum;
+  if (block.thread_rank() == 0) { block_sum = 0; }
+  block.sync();
+
+  AccumT local_sum = 0;
 
   for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) {
-    local_sum += data[idx];
+    local_sum += static_cast<AccumT>(data[idx]);
  }
 
-  cuda::atomic_ref<T, cuda::thread_scope_block> ref{*sum};
+  cuda::atomic_ref<AccumT, cuda::thread_scope_block> ref{block_sum};
   ref.fetch_add(local_sum, cuda::std::memory_order_relaxed);
 
   block.sync();
+
+  return block_sum;
 }
 
-template <typename T>
-__device__ T BlockSum(T const* data, int64_t size)
+template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
+__device__ AccumT BlockSum(T const* data, int64_t size)
 {
   auto block = cooperative_groups::this_thread_block();
 
@@ -415,11 +419,7 @@ __device__ T BlockSum(T const* data, int64_t size)
     if (are_all_nans(block, data, size)) { return 0; }
   }
 
-  __shared__ T block_sum;
-  if (block.thread_rank() == 0) { block_sum = 0; }
-  block.sync();
-
-  device_sum(block, data, size, &block_sum);
+  auto block_sum = device_sum(block, data, size);
   return block_sum;
 }
 
@@ -428,11 +428,7 @@ __device__ double BlockMean(T const* data, int64_t size)
 {
   auto block = cooperative_groups::this_thread_block();
 
-  __shared__ T block_sum;
-  if (block.thread_rank() == 0) { block_sum = 0; }
-  block.sync();
-
-  device_sum(block, data, size, &block_sum);
+  auto block_sum = device_sum(block, data, size);
   return static_cast<double>(block_sum) / static_cast<double>(size);
 }
 
@@ -443,17 +439,11 @@ __device__ double BlockCoVar(T const* lhs, T const* rhs, int64_t size)
 
   __shared__ double block_covar;
 
-  __shared__ T block_sum_lhs;
-  __shared__ T block_sum_rhs;
-
-  if (block.thread_rank() == 0) {
-    block_covar   = 0;
-    block_sum_lhs = 0;
-    block_sum_rhs = 0;
-  }
+  if (block.thread_rank() == 0) { block_covar = 0; }
   block.sync();
 
-  device_sum(block, lhs, size, &block_sum_lhs);
+  auto block_sum_lhs = device_sum(block, lhs, size);
+
   auto const mu_l = static_cast<double>(block_sum_lhs) / static_cast<double>(size);
   auto const mu_r = [=]() {
     if (lhs == rhs) {
       // If the lhs and rhs are the same, this is calculating variance.
       // Thus we can assume mu_r = mu_l.
       return mu_l;
     } else {
-      device_sum(block, rhs, size, &block_sum_rhs);
+      auto block_sum_rhs = device_sum(block, rhs, size);
       return static_cast<double>(block_sum_rhs) / static_cast<double>(size);
     }
   }();
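
For context, the core of the fix is that `device_sum` no longer accumulates into a `T`-typed shared variable; it accumulates into `AccumT`, which defaults to `int64_t` whenever `T` is integral, so summing three `INT32_MAX` values (the case the new test exercises) no longer wraps. The host-side sketch below is not cudf code and is not part of the patch; it only illustrates the `std::conditional_t`-based accumulator selection, assuming that widening every integral element type to `int64_t` is the intended behavior. The helper name `widened_sum` is made up for this example.

```cpp
// Host-side illustration (not the cudf shim) of the accumulator-widening idea:
// integral element types accumulate into int64_t, floating-point types keep their own type.
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>
#include <vector>

template <typename T, typename AccumT = std::conditional_t<std::is_integral_v<T>, int64_t, T>>
AccumT widened_sum(std::vector<T> const& data)
{
  AccumT acc = 0;
  for (auto const& v : data) {
    acc += static_cast<AccumT>(v);  // widen each element before adding
  }
  return acc;
}

int main()
{
  auto const max = std::numeric_limits<int32_t>::max();
  std::vector<int32_t> b{max, max, max};  // mirrors column "b" in the new test

  // 3 * INT32_MAX = 6442450941, which overflows int32_t but is exact in int64_t.
  std::cout << widened_sum(b) << "\n";
  return 0;
}
```

The patched `BlockSum`, `BlockMean`, and `BlockCoVar` rely on the same default accumulator type through `device_sum`; callers that previously passed a `T*` output pointer now receive the widened block sum as the return value instead.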