Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Docs update: how to use dynamic shared mem with block reduce #348

Merged
merged 9 commits into from
Aug 18, 2021
7 changes: 7 additions & 0 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ CUB_NAMESPACE_BEGIN
* \par Performance Considerations
* - Incurs zero bank conflicts for most types
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockDiscontinuity.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ CUB_NAMESPACE_BEGIN
* \par Performance Considerations
* - Proper device-specific padding ensures zero bank conflicts for most types.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockExchange.
*/
template <
typename InputT,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ enum BlockHistogramAlgorithm
* - The histogram output can be constructed in shared or device-accessible memory
* - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockHistogram.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,13 @@ enum BlockLoadAlgorithm
* The set of \p thread_data across the block of threads in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockLoad.
*/
template <
typename InputT,
Expand Down
9 changes: 8 additions & 1 deletion cub/block/block_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,13 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
* The corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockMergeSort.
*/
template <
typename KeyT,
Expand Down Expand Up @@ -576,4 +583,4 @@ public:
}
};

CUB_NAMESPACE_END
CUB_NAMESPACE_END
8 changes: 8 additions & 0 deletions cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ struct BlockRadixRankEmptyCallback
* {
*
* \endcode
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockRadixRank.
*/
template <
int BLOCK_DIM_X,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ CUB_NAMESPACE_BEGIN
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockRadixSort.
*/
template <
typename KeyT,
Expand Down
5 changes: 5 additions & 0 deletions cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ enum BlockReduceAlgorithm
*
* \endcode
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,13 @@ enum BlockScanAlgorithm
* The corresponding output \p thread_data in those threads will be
* <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockScan.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,13 @@ enum BlockStoreAlgorithm
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
* The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockStore.
*/
template <
typename T,
Expand Down
14 changes: 7 additions & 7 deletions examples/block/example_block_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,13 @@ int g_grid_size = 1;
//---------------------------------------------------------------------

/**
* Simple kernel for performing a block-wide exclusive prefix sum over integers
* Simple kernel for performing a block-wide reduction.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockReduceAlgorithm ALGORITHM>
__global__ void BlockSumKernel(
__global__ void BlockReduceKernel(
int *d_in, // Tile of input
int *d_out, // Tile aggregate
clock_t *d_elapsed) // Elapsed cycle count of block reduction
Expand Down Expand Up @@ -167,7 +167,7 @@ void Test()

// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
Expand All @@ -176,8 +176,8 @@ void Test()
(ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);

// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
// Run kernel
BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
Expand All @@ -200,8 +200,8 @@ void Test()

timer.Start();

// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
// Run kernel
BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
Expand Down
Loading