Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Docs update: how to use dynamic shared mem with block reduce #348

Merged
merged 9 commits into from
Aug 18, 2021
7 changes: 7 additions & 0 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ CUB_NAMESPACE_BEGIN
* \par Performance Considerations
* - Incurs zero bank conflicts for most types
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockDiscontinuity.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ CUB_NAMESPACE_BEGIN
* \par Performance Considerations
* - Proper device-specific padding ensures zero bank conflicts for most types.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockExchange.
*/
template <
typename InputT,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ enum BlockHistogramAlgorithm
* - The histogram output can be constructed in shared or device-accessible memory
* - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockHistogram.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,13 @@ enum BlockLoadAlgorithm
* The set of \p thread_data across the block of threads in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockLoad.
*/
template <
typename InputT,
Expand Down
9 changes: 8 additions & 1 deletion cub/block/block_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,13 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
* The corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockMergeSort.
*/
template <
typename KeyT,
Expand Down Expand Up @@ -576,4 +583,4 @@ public:
}
};

CUB_NAMESPACE_END
CUB_NAMESPACE_END
8 changes: 8 additions & 0 deletions cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ struct BlockRadixRankEmptyCallback
* {
*
* \endcode
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockRadixRank.
*/
template <
int BLOCK_DIM_X,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ CUB_NAMESPACE_BEGIN
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockRadixSort.
*/
template <
typename KeyT,
Expand Down
5 changes: 5 additions & 0 deletions cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ enum BlockReduceAlgorithm
*
* \endcode
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,13 @@ enum BlockScanAlgorithm
* The corresponding output \p thread_data in those threads will be
* <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockScan.
*/
template <
typename T,
Expand Down
7 changes: 7 additions & 0 deletions cub/block/block_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,13 @@ enum BlockStoreAlgorithm
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
* The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*
* \par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockStore.
*/
template <
typename T,
Expand Down
14 changes: 7 additions & 7 deletions examples/block/example_block_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,13 @@ int g_grid_size = 1;
//---------------------------------------------------------------------

/**
* Simple kernel for performing a block-wide exclusive prefix sum over integers
* Simple kernel for performing a block-wide reduction.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockReduceAlgorithm ALGORITHM>
__global__ void BlockSumKernel(
__global__ void BlockReduceKernel(
int *d_in, // Tile of input
int *d_out, // Tile aggregate
clock_t *d_elapsed) // Elapsed cycle count of block reduction
Expand Down Expand Up @@ -167,7 +167,7 @@ void Test()

// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
Expand All @@ -176,8 +176,8 @@ void Test()
(ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);

// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
// Run kernel
BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
Expand All @@ -200,8 +200,8 @@ void Test()

timer.Start();

// Run aggregate/prefix kernel
BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
// Run kernel
BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
Expand Down
Loading