From 231cb716baf44b64e0284e23ae9666500de7d593 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 18 Jun 2024 11:50:46 -0700 Subject: [PATCH] Fix a size overflow bug in hash groupby (#16053) This PR fixes a size overflow bug discovered by @matal-nvidia. It converts the groupby problem size to `int64_t` so it won't overflow if larger than `INT_MAX / 2` with 50% hash table occupancy. Unit tests for this scenario will saturate device memory and take longer than necessary, making them likely not worth adding. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16053 --- cpp/src/groupby/hash/groupby.cu | 3 ++- java/src/test/java/ai/rapids/cudf/TableTest.java | 3 ++- python/cudf/cudf/core/groupby/groupby.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0ec293ae3f0..5fe4a5eb30f 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -553,7 +553,8 @@ std::unique_ptr groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const num_keys = keys.num_rows(); + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); auto const null_keys_are_equal = null_equality::EQUAL; auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index dc6eb55fc6a..050bcbb268f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7838,11 +7838,12 @@ void testSumWithStrings() { .build(); Table result = t.groupBy(0).aggregate( GroupByAggregation.sum().onColumn(1)); + Table sorted = result.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column("1-URGENT", "3-MEDIUM") .column(5289L + 5303L, 5203L + 5206L) .build()) { - assertTablesAreEqual(expected, result); + assertTablesAreEqual(expected, sorted); } } diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d08268eea3a..77b54a583d3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1308,7 +1308,7 @@ def pipe(self, func, *args, **kwargs): To get the difference between each groups maximum and minimum value in one pass, you can do - >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + >>> df.groupby('A', sort=True).pipe(lambda x: x.max() - x.min()) B A a 2