From 231cb716baf44b64e0284e23ae9666500de7d593 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 18 Jun 2024 11:50:46 -0700
Subject: [PATCH] Fix a size overflow bug in hash groupby (#16053)

This PR fixes a size overflow bug discovered by @matal-nvidia. It converts the groupby problem size to `int64_t` so it won't overflow if larger than `INT_MAX / 2` with 50% hash table occupancy.

Unit tests for this scenario will saturate device memory and take longer than necessary, making them likely not worth adding.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16053
---
 cpp/src/groupby/hash/groupby.cu                  | 3 ++-
 java/src/test/java/ai/rapids/cudf/TableTest.java | 3 ++-
 python/cudf/cudf/core/groupby/groupby.py         | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 0ec293ae3f0..5fe4a5eb30f 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -553,7 +553,8 @@ std::unique_ptr<table> groupby(table_view const& keys,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  auto const num_keys            = keys.num_rows();
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys            = static_cast<int64_t>(keys.num_rows());
   auto const null_keys_are_equal = null_equality::EQUAL;
   auto const has_null            = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index dc6eb55fc6a..050bcbb268f 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -7838,11 +7838,12 @@ void testSumWithStrings() {
         .build();
          Table result = t.groupBy(0).aggregate(
              GroupByAggregation.sum().onColumn(1));
+         Table sorted = result.orderBy(OrderByArg.asc(0));
          Table expected = new Table.TestBuilder()
              .column("1-URGENT", "3-MEDIUM")
              .column(5289L + 5303L, 5203L + 5206L)
              .build()) {
-      assertTablesAreEqual(expected, result);
+      assertTablesAreEqual(expected, sorted);
     }
   }
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d08268eea3a..77b54a583d3 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1308,7 +1308,7 @@ def pipe(self, func, *args, **kwargs):
         To get the difference between each groups maximum and minimum value
         in one pass, you can do
 
-        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
+        >>> df.groupby('A', sort=True).pipe(lambda x: x.max() - x.min())
            B
         A
         a  2