diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh
index 234d4ee5bf..d689954e0b 100644
--- a/cub/block/block_load.cuh
+++ b/cub/block/block_load.cuh
@@ -472,6 +472,18 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_DIRECT,
+ /**
+ * \par Overview
+ *
+ * A [striped arrangement](index.html#sec5sec3) of data is read
+ * directly from memory.
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) decreases as the
+ *   access stride between threads increases (i.e., the number of items per thread).
+ */
+ BLOCK_LOAD_STRIPED,
+
/**
* \par Overview
*
@@ -507,7 +519,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_TRANSPOSE,
-
/**
* \par Overview
*
@@ -528,7 +539,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_WARP_TRANSPOSE,
-
/**
* \par Overview
*
@@ -572,6 +582,8 @@ enum BlockLoadAlgorithm
* - BlockLoad can be optionally specialized by different data movement strategies:
* -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
+ * -# cub::BLOCK_LOAD_STRIPED. A [striped arrangement](index.html#sec5sec3)
+ * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
* of data is read directly from memory using CUDA's built-in vectorized loads as a
* coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
@@ -703,6 +715,59 @@ private:
};
+ /**
+ * BLOCK_LOAD_STRIPED specialization of load helper
+ */
+ template <int DUMMY>
+ struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
+ {
+ /// Shared memory storage layout type
+ typedef NullType TempStorage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ LoadInternal(
+ TempStorage &/*temp_storage*/,
+ int linear_tid)
+ :
+ linear_tid(linear_tid)
+ {}
+
+ /// Load a linear segment of items from memory
+ template <typename InputIteratorT>
+ __device__ __forceinline__ void Load(
+ InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
+ InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range
+ template <typename InputIteratorT>
+ __device__ __forceinline__ void Load(
+ InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
+ InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+ template <typename InputIteratorT, typename DefaultT>
+ __device__ __forceinline__ void Load(
+ InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
+ InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+ }
+
+ };
+
+
/**
* BLOCK_LOAD_VECTORIZE specialization of load helper
*/
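
A minimal usage sketch of the new load strategy (not part of the patch): the kernel name, the 128-thread block size, and the 4 items per thread are illustrative; the BlockLoad interface itself is the library's existing one.

// Sketch only: illustrative configuration, hypothetical kernel name.
#include <cub/block/block_load.cuh>

__global__ void ExampleStripedLoadKernel(const int *d_in)
{
    // Specialize BlockLoad for the newly added BLOCK_LOAD_STRIPED strategy.
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_STRIPED> BlockLoadT;

    // Striped loads need no shared memory (TempStorage is NullType), but the
    // allocation is still declared to keep the interface uniform.
    __shared__ typename BlockLoadT::TempStorage temp_storage;

    int thread_data[4];
    BlockLoadT(temp_storage).Load(d_in + blockIdx.x * (128 * 4), thread_data);

    // Thread t now holds items t, t+128, t+256, t+384 of the block's segment.
}
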
diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh
index 155c3b4217..cb00ec7287 100644
--- a/cub/block/block_store.cuh
+++ b/cub/block/block_store.cuh
@@ -364,6 +364,17 @@ enum BlockStoreAlgorithm
*/
BLOCK_STORE_DIRECT,
+ /**
+ * \par Overview
+ * A [striped arrangement](index.html#sec5sec3) of data is written
+ * directly to memory.
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) decreases as the
+ *   access stride between threads increases (i.e., the number of items per thread).
+ */
+ BLOCK_STORE_STRIPED,
+
/**
* \par Overview
*
@@ -432,7 +443,6 @@ enum BlockStoreAlgorithm
* latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
*/
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-
};
@@ -456,6 +466,8 @@ enum BlockStoreAlgorithm
* - BlockStore can be optionally specialized by different data movement strategies:
* -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written
* directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ * -# cub::BLOCK_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3)
+ * of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
* of data is written directly to memory using CUDA's built-in vectorized stores as a
* coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
@@ -579,6 +591,47 @@ private:
};
+ /**
+ * BLOCK_STORE_STRIPED specialization of store helper
+ */
+ template <int DUMMY>
+ struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
+ {
+ /// Shared memory storage layout type
+ typedef NullType TempStorage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ StoreInternal(
+ TempStorage &/*temp_storage*/,
+ int linear_tid)
+ :
+ linear_tid(linear_tid)
+ {}
+
+ /// Store items into a linear segment of memory
+ template <typename OutputIteratorT>
+ __device__ __forceinline__ void Store(
+ OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ {
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+ }
+
+ /// Store items into a linear segment of memory, guarded by range
+ template <typename OutputIteratorT>
+ __device__ __forceinline__ void Store(
+ OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int valid_items) ///< [in] Number of valid items to write
+ {
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+ }
+ };
+
+
/**
* BLOCK_STORE_VECTORIZE specialization of store helper
*/
@@ -822,6 +875,7 @@ private:
StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
}
};
+
/******************************************************************************
* Type definitions
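
A companion sketch for the store side (not part of the patch), under the same illustrative 128x4 configuration and with a hypothetical kernel name:

// Sketch only: illustrative configuration, hypothetical kernel name.
#include <cub/block/block_store.cuh>

__global__ void ExampleStripedStoreKernel(int *d_out)
{
    // Specialize BlockStore for the newly added BLOCK_STORE_STRIPED strategy.
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_STRIPED> BlockStoreT;
    __shared__ typename BlockStoreT::TempStorage temp_storage;

    // Produce thread_data in striped order: each value equals its striped
    // offset within the block's tile.
    int thread_data[4];
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        thread_data[i] = threadIdx.x + i * 128;

    // Each thread writes at stride 128, matching the striped layout.
    BlockStoreT(temp_storage).Store(d_out + blockIdx.x * (128 * 4), thread_data);
}
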
diff --git a/test/test_block_load_store.cu b/test/test_block_load_store.cu
index 10e4b2dff1..ec1c2bd02c 100644
--- a/test/test_block_load_store.cu
+++ b/test/test_block_load_store.cu
@@ -439,6 +439,7 @@ void TestStrategy(
TestPointerType(grid_size, fraction_valid);
TestPointerType(grid_size, fraction_valid);
TestPointerType(grid_size, fraction_valid);
+ TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_STRIPED, BLOCK_STORE_STRIPED>(grid_size, fraction_valid);
}
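
The new test case pairs the two strategies through the load/store round trip. A standalone sketch of that round trip (not part of the patch; the kernel name and the 64-thread, 2-item configuration are illustrative, and the real test sweeps many configurations):

// Sketch only: guarded striped load followed by guarded striped store.
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void StripedRoundTripKernel(const int *d_in, int *d_out, int valid_items)
{
    typedef cub::BlockLoad<int, 64, 2, cub::BLOCK_LOAD_STRIPED>   BlockLoadT;
    typedef cub::BlockStore<int, 64, 2, cub::BLOCK_STORE_STRIPED> BlockStoreT;

    // Both TempStorage types are empty for the striped strategies; the union
    // mirrors the usual CUB idiom for reusing shared memory.
    __shared__ union
    {
        typename BlockLoadT::TempStorage  load;
        typename BlockStoreT::TempStorage store;
    } temp_storage;

    int items[2];
    BlockLoadT(temp_storage.load).Load(d_in, items, valid_items, 0);   // guarded load, OOB default 0
    BlockStoreT(temp_storage.store).Store(d_out, items, valid_items);  // guarded store
}
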