Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Add BLOCK_LOAD_STRIPED and BLOCK_STORE_STRIPED #274

Merged
merged 8 commits into from
Jun 11, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,18 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_DIRECT,

/**
* \par Overview
*
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
* directly from memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_LOAD_STRIPED,

/**
* \par Overview
*
Expand Down Expand Up @@ -507,7 +519,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_TRANSPOSE,


/**
* \par Overview
*
Expand All @@ -528,7 +539,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_WARP_TRANSPOSE,


/**
* \par Overview
*
Expand Down Expand Up @@ -572,6 +582,8 @@ enum BlockLoadAlgorithm
* - BlockLoad can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory using CUDA's built-in vectorized loads as a
* coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
Expand Down Expand Up @@ -703,6 +715,59 @@ private:
};


/**
 * BLOCK_LOAD_STRIPED specialization of load helper.  Delegates directly to
 * LoadDirectStriped<BLOCK_THREADS>, so items are read in a striped
 * arrangement with no shared-memory exchange.
 */
template <int DUMMY>
struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
{
    /// Shared memory storage layout type (NullType: striped loads need no shared memory)
    typedef NullType TempStorage;

    /// Linear thread-id
    int linear_tid;

    /// Constructor (temp_storage is accepted for interface uniformity but unused)
    __device__ __forceinline__ LoadInternal(
        TempStorage &temp_storage,
        int linear_tid)
    :
        linear_tid(linear_tid)
    {}

    /// Load a linear segment of items from memory
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD])      ///< [out] Data to load
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
    }

    /// Load a linear segment of items from memory, guarded by range
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD],      ///< [out] Data to load
        int valid_items)                        ///< [in] Number of valid items to load
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
    }

    /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD],      ///< [out] Data to load
        int valid_items,                        ///< [in] Number of valid items to load
        DefaultT oob_default)                   ///< [in] Default value to assign out-of-bound items
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
    }

};


/**
* BLOCK_LOAD_VECTORIZE specialization of load helper
*/
Expand Down
59 changes: 58 additions & 1 deletion cub/block/block_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,17 @@ enum BlockStoreAlgorithm
*/
BLOCK_STORE_DIRECT,

/**
* \par Overview
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_STORE_STRIPED,

/**
* \par Overview
*
Expand Down Expand Up @@ -432,7 +443,6 @@ enum BlockStoreAlgorithm
* latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
*/
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,

};


Expand All @@ -456,6 +466,8 @@ enum BlockStoreAlgorithm
* - BlockStore can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory using CUDA's built-in vectorized stores as a
* coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
Expand Down Expand Up @@ -579,6 +591,50 @@ private:
};


/**
 * BLOCK_STORE_STRIPED specialization of store helper.  Delegates directly to
 * StoreDirectStriped<BLOCK_THREADS>, so items are written in a striped
 * arrangement with no shared-memory exchange.
 */
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
{
    /// Shared memory storage layout type (NullType: striped stores need no shared memory)
    typedef NullType TempStorage;

    /// Linear thread-id
    int linear_tid;

    /// Constructor (temp_storage is accepted for interface uniformity but unused;
    /// TempStorage is NullType here, so there is no temp_storage member to cache)
    __device__ __forceinline__ StoreInternal(
        TempStorage &temp_storage,
        int linear_tid)
    :
        linear_tid(linear_tid)
    {}

    /// Store items into a linear segment of memory
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,              ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD])           ///< [in] Data to store
    {
        StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
    }

    /// Store items into a linear segment of memory, guarded by range
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,              ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD],           ///< [in] Data to store
        int valid_items)                        ///< [in] Number of valid items to write
    {
        // Unlike the TRANSPOSE specializations, there is no shared memory here
        // to stage valid_items through, so pass it straight to the direct store.
        StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
    }
};


/**
* BLOCK_STORE_VECTORIZE specialization of store helper
*/
Expand Down Expand Up @@ -822,6 +878,7 @@ private:
StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
}
};


/******************************************************************************
* Type definitions
Expand Down
1 change: 1 addition & 0 deletions test/test_block_load_store.cu
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ void TestStrategy(
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_STRIPED, BLOCK_STORE_STRIPED>(grid_size, fraction_valid);
}


Expand Down