Add cub::DivideAndRoundUp helper to avoid overflow.
Users have been reporting that device algorithms return invalid
`temp_storage_bytes` values when `num_items` is close to -- but
not over -- INT32_MAX.

This is caused by an overflow in the numerator of the pattern
`num_tiles = (num_items + items_per_tile - 1) / items_per_tile`.

The new function implements the same calculation but protects against
overflow.

Fixes NVIDIA#221.
Bug 3075796
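
For illustration only (not part of this commit): a minimal sketch of the failure mode and the fix. It assumes a `.cu` file compiled with `nvcc` against a CUB checkout that contains this change; the values are arbitrary but large enough to trigger the overflow.

```cpp
#include <cstdint>
#include <cstdio>

#include <cub/cub.cuh> // provides cub::DivideAndRoundUp after this commit

int main()
{
  int num_items      = INT32_MAX - 10; // close to, but not over, INT32_MAX
  int items_per_tile = 512;

  // Old pattern: the numerator (num_items + items_per_tile - 1) exceeds
  // INT32_MAX, which is signed overflow (undefined behavior) and in practice
  // yields a negative tile count:
  //   int num_tiles = (num_items + items_per_tile - 1) / items_per_tile;

  // New helper: computes n / d + (n % d != 0) and never forms the
  // overflowing sum.
  int num_tiles = cub::DivideAndRoundUp(num_items, items_per_tile);

  std::printf("num_tiles = %d\n", num_tiles); // prints 4194304
  return 0;
}
```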
alliepiper committed Feb 10, 2021
1 parent b7207a2 commit ee506d4
Showing 12 changed files with 104 additions and 29 deletions.
3 changes: 2 additions & 1 deletion cub/device/dispatch/dispatch_histogram.cuh
@@ -41,6 +41,7 @@
#include "../../agent/agent_histogram.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"
@@ -518,7 +519,7 @@ struct DipatchHistogram

// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
- int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
+ int tiles_per_row = static_cast<int>(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile));
int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col = (blocks_per_row > 0) ?
int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
12 changes: 6 additions & 6 deletions cub/device/dispatch/dispatch_radix_sort.cuh
@@ -48,6 +48,7 @@
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -1261,10 +1262,9 @@ struct DispatchRadixSort :
// parts handle inputs with >=2**30 elements, due to the way lookback works
// for testing purposes, one part is <= 2**28 elements
const int PART_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
- int num_passes = (end_bit - begin_bit + RADIX_BITS - 1) / RADIX_BITS;
- int num_parts = ((long long)num_items + PART_SIZE - 1) / PART_SIZE;
- OffsetT max_num_blocks = (CUB_MIN(num_items, PART_SIZE) + ONESWEEP_TILE_ITEMS - 1) /
- ONESWEEP_TILE_ITEMS;
+ int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS);
+ int num_parts = static_cast<int>(cub::DivideAndRoundUp(num_items, PART_SIZE));
+ OffsetT max_num_blocks = cub::DivideAndRoundUp(CUB_MIN(num_items, PART_SIZE), ONESWEEP_TILE_ITEMS);

size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT);
size_t allocation_sizes[] =
@@ -1341,7 +1341,7 @@ struct DispatchRadixSort :
for (int part = 0; part < num_parts; ++part)
{
int part_num_items = CUB_MIN(num_items - part * PART_SIZE, PART_SIZE);
- int num_blocks = (part_num_items + ONESWEEP_TILE_ITEMS - 1) / ONESWEEP_TILE_ITEMS;
+ int num_blocks = cub::DivideAndRoundUp(part_num_items, ONESWEEP_TILE_ITEMS);
if (CubDebug(error = cudaMemsetAsync(
d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
stream))) break;
@@ -1466,7 +1466,7 @@ struct DispatchRadixSort :

// Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
int num_bits = end_bit - begin_bit;
- int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+ int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
5 changes: 3 additions & 2 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -309,7 +310,7 @@ struct DispatchReduceByKey

// Number of input tiles
int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -329,7 +330,7 @@
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke init_kernel to initialize tile descriptors
7 changes: 4 additions & 3 deletions cub/device/dispatch/dispatch_rle.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -291,7 +292,7 @@ struct DeviceRleDispatch

// Number of input tiles
int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -311,7 +312,7 @@
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log device_scan_init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
@@ -346,7 +347,7 @@
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
- scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+ scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

// Log device_rle_sweep_kernel configuration
5 changes: 3 additions & 2 deletions cub/device/dispatch/dispatch_scan.cuh
@@ -43,6 +43,7 @@
#include "../../config.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -296,7 +297,7 @@ struct DispatchScan:

// Number of input tiles
int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
- int num_tiles = static_cast<int>((num_items + tile_size - 1) / tile_size);
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -320,7 +321,7 @@
if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log init_kernel configuration
- int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+ int init_grid_size = cub::DivideAndRoundUp(num_tiles, int{INIT_KERNEL_THREADS});
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke init_kernel to initialize tile descriptors
7 changes: 4 additions & 3 deletions cub/device/dispatch/dispatch_select_if.cuh
@@ -43,6 +43,7 @@
#include "../../thread/thread_operators.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

@@ -297,7 +298,7 @@ struct DispatchSelectIf

// Number of input tiles
int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
- int num_tiles = (num_items + tile_size - 1) / tile_size;
+ int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

// Specify temporary storage allocation requirements
size_t allocation_sizes[1];
@@ -317,7 +318,7 @@
if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

// Log scan_init_kernel configuration
- int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+ int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke scan_init_kernel to initialize tile descriptors
@@ -352,7 +353,7 @@
// Get grid size for scanning tiles
dim3 scan_grid_size;
scan_grid_size.z = 1;
- scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+ scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x);
scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);

// Log select_if_kernel configuration
15 changes: 8 additions & 7 deletions cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -43,6 +43,7 @@
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_math.cuh"
#include "../../thread/thread_search.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"
@@ -510,8 +511,8 @@ struct DispatchSpmv
}

// Get search/init grid dims
- int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
- int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+ int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
+ int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size);

if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
@@ -552,8 +553,8 @@
int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

// Number of tiles for kernels
- int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
- int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+ int num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size);
+ int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size);

// Get SM occupancy for kernels
int spmv_sm_occupancy;
@@ -571,12 +572,12 @@
// Get grid dimensions
dim3 spmv_grid_size(
CUB_MIN(num_merge_tiles, max_dim_x),
- (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+ cub::DivideAndRoundUp(num_merge_tiles, max_dim_x),
1);

dim3 segment_fixup_grid_size(
CUB_MIN(num_segment_fixup_tiles, max_dim_x),
- (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+ cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x),
1);

// Get the temporary storage allocation requirements
@@ -604,7 +605,7 @@

// Get search/init grid dims
int search_block_size = INIT_KERNEL_THREADS;
- int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+ int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size);

#if CUB_INCLUDE_HOST_CODE
if (CUB_IS_HOST_CODE)
3 changes: 2 additions & 1 deletion cub/grid/grid_even_share.cuh
@@ -37,6 +37,7 @@
#include "../config.cuh"
#include "../util_namespace.cuh"
#include "../util_macro.cuh"
#include "../util_math.cuh"
#include "../util_type.cuh"
#include "grid_mapping.cuh"

@@ -129,7 +130,7 @@ public:
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
- this->total_tiles = (num_items_ + tile_items - 1) / tile_items;
+ this->total_tiles = cub::DivideAndRoundUp(num_items_, tile_items);
this->grid_size = CUB_MIN(static_cast<int>(total_tiles), max_grid_size);
OffsetT avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks
63 changes: 63 additions & 0 deletions cub/util_math.cuh
@@ -0,0 +1,63 @@
/******************************************************************************
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

/**
* \file
* Define helper math functions.
*/

#pragma once

#include <type_traits>

// Optional outer namespace(s)
CUB_NS_PREFIX

// CUB namespace
namespace cub
{

/**
* Divide n by d, round up if any remainder, and return the result.
*
* Effectively performs `(n + d - 1) / d`, but is robust against the case where
* `(n + d - 1)` would overflow.
*/
template <typename NumeratorT, typename DenominatorT>
__host__ __device__ __forceinline__ constexpr NumeratorT
DivideAndRoundUp(NumeratorT n, DenominatorT d)
{
  static_assert(std::is_integral<NumeratorT>::value &&
                std::is_integral<DenominatorT>::value,
                "DivideAndRoundUp is only intended for integral types.");

  // Static cast to undo integral promotion.
  return static_cast<NumeratorT>(n / d + (n % d != 0 ? 1 : 0));
}

} // namespace cub
CUB_NS_POSTFIX // Optional outer namespace(s)
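
Because the helper is `constexpr`, its behavior can be checked at compile time. The checks below are an illustrative sketch rather than part of the commit, and assume a translation unit compiled with `nvcc` against this version of CUB:

```cpp
#include <cstdint>
#include <type_traits>

#include <cub/cub.cuh>

// ceil(n / d) equals n / d + (n % d != 0) for non-negative integers:
static_assert(cub::DivideAndRoundUp(0, 8)  == 0, "zero numerator");
static_assert(cub::DivideAndRoundUp(16, 8) == 2, "exact multiple");
static_assert(cub::DivideAndRoundUp(17, 8) == 3, "rounds up on any remainder");

// The static_cast in the helper undoes integral promotion, so the result type
// follows the numerator type even for small integer types:
static_assert(std::is_same<decltype(cub::DivideAndRoundUp(std::uint16_t{17},
                                                           std::uint16_t{8})),
                           std::uint16_t>::value,
              "result type matches NumeratorT");
```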
7 changes: 5 additions & 2 deletions test/test_device_radix_sort.cu
@@ -42,6 +42,7 @@
#endif

#include <cub/util_allocator.cuh>
+ #include <cub/util_math.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>

@@ -1035,8 +1036,10 @@ void TestSizes(
int max_items,
int max_segments)
{
- for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32)
- {
+ for (int num_items = max_items;
+      num_items > 1;
+      num_items = cub::DivideAndRoundUp(num_items, 32))
+ {
TestSegments(h_keys, num_items, max_segments);
}
TestSegments(h_keys, 1, max_segments);
3 changes: 2 additions & 1 deletion test/test_device_reduce.cu
@@ -41,6 +41,7 @@
#include <thrust/reduce.h>

#include <cub/util_allocator.cuh>
+ #include <cub/util_math.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
@@ -1085,7 +1086,7 @@ void TestByBackend(
// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
int max_items_per_segment = 128000;

- for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
+ for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment);
num_segments < max_segments;
num_segments = (num_segments * 32) + 1)
{
3 changes: 2 additions & 1 deletion test/test_util.h
@@ -54,6 +54,7 @@
#include "cub/util_device.cuh"
#include "cub/util_type.cuh"
#include "cub/util_macro.cuh"
#include "cub/util_math.cuh"
#include "cub/iterator/discard_output_iterator.cuh"

/******************************************************************************
@@ -1560,7 +1561,7 @@ void InitializeSegments(
if (num_segments <= 0)
return;

- unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
+ unsigned int expected_segment_length = cub::DivideAndRoundUp(num_items, num_segments);
int offset = 0;
for (int i = 0; i < num_segments; ++i)
{
