Add cub::DivideAndRoundUp helper to avoid overflow.
Users have been reporting that device algorithms return invalid
`temp_storage_bytes` values when `num_items` is close to -- but
not over -- INT32_MAX.

This is caused by an overflow in the numerator of the pattern
`num_tiles = (num_items + items_per_tile - 1) / items_per_tile`.

The new function implements the same calculation but protects against
overflow.

Fixes NVIDIA#221.
Bug 3075796
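
For illustration only (not part of this commit): a minimal sketch of the failure mode and the fix. It assumes a `.cu` file compiled with `nvcc` against a CUB checkout that contains this change; the values are arbitrary but large enough to trigger the overflow.

```cpp
#include <cstdint>
#include <cstdio>

#include <cub/cub.cuh> // provides cub::DivideAndRoundUp after this commit

int main()
{
  int num_items      = INT32_MAX - 10; // close to, but not over, INT32_MAX
  int items_per_tile = 512;

  // Old pattern: the numerator (num_items + items_per_tile - 1) exceeds
  // INT32_MAX, which is signed overflow (undefined behavior) and in practice
  // yields a negative tile count:
  //   int num_tiles = (num_items + items_per_tile - 1) / items_per_tile;

  // New helper: computes n / d + (n % d != 0) and never forms the
  // overflowing sum.
  int num_tiles = cub::DivideAndRoundUp(num_items, items_per_tile);

  std::printf("num_tiles = %d\n", num_tiles); // prints 4194304
  return 0;
}
```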
alliepiper committed Feb 10, 2021
1 parent b7207a2 commit ee506d4
Showing 12 changed files with 104 additions and 29 deletions.
3 changes: 2 additions & 1 deletion cub/device/dispatch/dispatch_histogram.cuh
@@ -41,6 +41,7 @@
#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"
@@ -518,7 +519,7 @@ struct DipatchHistogram

// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
- int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
+ int tiles_per_row = static_cast<int>(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile));
int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col = (blocks_per_row > 0) ?
int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
12 changes: 6 additions & 6 deletions cub/device/dispatch/dispatch_radix_sort.cuh
@@ -48,6 +48,7 @@
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -1261,10 +1262,9 @@ struct DispatchRadixSort :
// parts handle inputs with >=2**30 elements, due to the way lookback works
// for testing purposes, one part is <= 2**28 elements
const int PART_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
- int num_passes = (end_bit - begin_bit + RADIX_BITS - 1) / RADIX_BITS;
- int num_parts = ((long long)num_items + PART_SIZE - 1) / PART_SIZE;
- OffsetT max_num_blocks = (CUB_MIN(num_items, PART_SIZE) + ONESWEEP_TILE_ITEMS - 1) /
- ONESWEEP_TILE_ITEMS;
+ int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS);
+ int num_parts = static_cast<int>(cub::DivideAndRoundUp(num_items, PART_SIZE));
+ OffsetT max_num_blocks = cub::DivideAndRoundUp(CUB_MIN(num_items, PART_SIZE), ONESWEEP_TILE_ITEMS);

size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT);
size_t allocation_sizes[] =
@@ -1341,7 +1341,7 @@ struct DispatchRadixSort :
for (int part = 0; part < num_parts; ++part)
{
int part_num_items = CUB_MIN(num_items - part * PART_SIZE, PART_SIZE);
- int num_blocks = (part_num_items + ONESWEEP_TILE_ITEMS - 1) / ONESWEEP_TILE_ITEMS;
+ int num_blocks = cub::DivideAndRoundUp(part_num_items, ONESWEEP_TILE_ITEMS);
if (CubDebug(error = cudaMemsetAsync(
d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
stream))) break;
@@ -1466,7 +1466,7 @@ struct DispatchRadixSort :

// Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
int num_bits = end_bit - begin_bit;
- int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+ int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
5 changes: 3 additions & 2 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -309,7 +310,7 @@ struct DispatchReduceByKey

// Number of input tiles
int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -329,7 +330,7 @@
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke init_kernel to initialize tile descriptors
7 changes: 4 additions & 3 deletions cub/device/dispatch/dispatch_rle.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -291,7 +292,7 @@ struct DeviceRleDispatch

// Number of input tiles
int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -311,7 +312,7 @@
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log device_scan_init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
@@ -346,7 +347,7 @@
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
- scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+ scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

// Log device_rle_sweep_kernel configuration
5 changes: 3 additions & 2 deletions cub/device/dispatch/dispatch_scan.cuh
@@ -43,6 +43,7 @@
#include "../../config.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -296,7 +297,7 @@ struct DispatchScan:

// Number of input tiles
int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
- int num_tiles = static_cast<int>((num_items + tile_size - 1) / tile_size);
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -320,7 +321,7 @@
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log init_kernel configuration
- int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+ int init_grid_size = cub::DivideAndRoundUp(num_tiles, int{INIT_KERNEL_THREADS});
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke init_kernel to initialize tile descriptors
7 changes: 4 additions & 3 deletions cub/device/dispatch/dispatch_select_if.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -297,7 +298,7 @@ struct DispatchSelectIf

// Number of input tiles
int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -317,7 +318,7 @@
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log scan_init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke scan_init_kernel to initialize tile descriptors
@@ -352,7 +353,7 @@
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
- scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+ scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

// Log select_if_kernel configuration
15 changes: 8 additions & 7 deletions cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -43,6 +43,7 @@
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"
@@ -510,8 +511,8 @@ struct DispatchSpmv
}

// Get search/init grid dims
- int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
- int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+ int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
+ int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size);

if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
@@ -552,8 +553,8 @@
int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

// Number of tiles for kernels
- int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
- int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+ int num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size);
+ int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size);

// Get SM occupancy for kernels
int spmv_sm_occupancy;
@@ -571,12 +572,12 @@
// Get grid dimensions
dim3 spmv_grid_size(
CUB_MIN(num_merge_tiles, max_dim_x),
- (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+ cub::DivideAndRoundUp(num_merge_tiles, max_dim_x),
1);

dim3 segment_fixup_grid_size(
CUB_MIN(num_segment_fixup_tiles, max_dim_x),
- (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+ cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x),
1);

// Get the temporary storage allocation requirements
@@ -604,7 +605,7 @@

// Get search/init grid dims
int search_block_size = INIT_KERNEL_THREADS;
- int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+ int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size);

#if CUB_INCLUDE_HOST_CODE
if (CUB_IS_HOST_CODE)
3 changes: 2 additions & 1 deletion cub/grid/grid_even_share.cuh
@@ -37,6 +37,7 @@
#include "../config.cuh"
#include "../util_namespace.cuh"
#include "../util_macro.cuh"
#include "../util_math.cuh"
#include "../util_type.cuh"
#include "grid_mapping.cuh"

@@ -129,7 +130,7 @@ public:
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
- this->total_tiles = (num_items_ + tile_items - 1) / tile_items;
+ this->total_tiles = cub::DivideAndRoundUp(num_items_, tile_items);
this->grid_size = CUB_MIN(static_cast<int>(total_tiles), max_grid_size);
OffsetT avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks
63 changes: 63 additions & 0 deletions cub/util_math.cuh
@@ -0,0 +1,63 @@
/******************************************************************************
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

/**
* \file
* Define helper math functions.
*/

#pragma once

#include <type_traits>

// Optional outer namespace(s)
CUB_NS_PREFIX

// CUB namespace
namespace cub
{

/**
* Divide n by d, round up if any remainder, and return the result.
*
* Effectively performs `(n + d - 1) / d`, but is robust against the case where
* `(n + d - 1)` would overflow.
*/
template <typename NumeratorT, typename DenominatorT>
__host__ __device__ __forceinline__ constexpr NumeratorT
DivideAndRoundUp(NumeratorT n, DenominatorT d)
{
  static_assert(std::is_integral<NumeratorT>::value &&
                std::is_integral<DenominatorT>::value,
                "DivideAndRoundUp is only intended for integral types.");

  // Static cast to undo integral promotion.
  return static_cast<NumeratorT>(n / d + (n % d != 0 ? 1 : 0));
}

} // namespace cub
CUB_NS_POSTFIX // Optional outer namespace(s)
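
Because the helper is `constexpr`, its behavior can be checked at compile time. The checks below are an illustrative sketch rather than part of the commit, and assume a translation unit compiled with `nvcc` against this version of CUB:

```cpp
#include <cstdint>
#include <type_traits>

#include <cub/cub.cuh>

// ceil(n / d) equals n / d + (n % d != 0) for non-negative integers:
static_assert(cub::DivideAndRoundUp(0, 8)  == 0, "zero numerator");
static_assert(cub::DivideAndRoundUp(16, 8) == 2, "exact multiple");
static_assert(cub::DivideAndRoundUp(17, 8) == 3, "rounds up on any remainder");

// The static_cast in the helper undoes integral promotion, so the result type
// follows the numerator type even for small integer types:
static_assert(std::is_same<decltype(cub::DivideAndRoundUp(std::uint16_t{17},
                                                           std::uint16_t{8})),
                           std::uint16_t>::value,
              "result type matches NumeratorT");
```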
7 changes: 5 additions & 2 deletions test/test_device_radix_sort.cu
@@ -42,6 +42,7 @@
#endif

#include <cub/util_allocator.cuh>
+ #include <cub/util_math.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>

@@ -1035,8 +1036,10 @@ void TestSizes(
int max_items,
int max_segments)
{
- for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32)
- {
+ for (int num_items = max_items;
+      num_items > 1;
+      num_items = cub::DivideAndRoundUp(num_items, 32))
+ {
TestSegments(h_keys, num_items, max_segments);
}
TestSegments(h_keys, 1, max_segments);
3 changes: 2 additions & 1 deletion test/test_device_reduce.cu
@@ -41,6 +41,7 @@
#include <thrust/reduce.h>

#include <cub/util_allocator.cuh>
+ #include <cub/util_math.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
@@ -1085,7 +1086,7 @@ void TestByBackend(
// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
int max_items_per_segment = 128000;

- for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
+ for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment);
num_segments < max_segments;
num_segments = (num_segments * 32) + 1)
{
3 changes: 2 additions & 1 deletion test/test_util.h
@@ -54,6 +54,7 @@
#include "cub/util_device.cuh"
#include "cub/util_type.cuh"
#include "cub/util_macro.cuh"
#include "cub/util_math.cuh"
#include "cub/iterator/discard_output_iterator.cuh"

/******************************************************************************
@@ -1560,7 +1561,7 @@ void InitializeSegments(
if (num_segments <= 0)
return;

- unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
+ unsigned int expected_segment_length = cub::DivideAndRoundUp(num_items, num_segments);
int offset = 0;
for (int i = 0; i < num_segments; ++i)
{
