diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh
index 234d4ee5bf..d689954e0b 100644
--- a/cub/block/block_load.cuh
+++ b/cub/block/block_load.cuh
@@ -472,6 +472,18 @@ enum BlockLoadAlgorithm
      */
     BLOCK_LOAD_DIRECT,
 
+    /**
+     * \par Overview
+     *
+     * A [striped arrangement](index.html#sec5sec3) of data is read
+     * directly from memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number of items per thread).
+     */
+    BLOCK_LOAD_STRIPED,
+
     /**
      * \par Overview
      *
@@ -507,7 +519,6 @@ enum BlockLoadAlgorithm
      */
     BLOCK_LOAD_TRANSPOSE,
 
-
     /**
      * \par Overview
      *
@@ -528,7 +539,6 @@ enum BlockLoadAlgorithm
      */
     BLOCK_LOAD_WARP_TRANSPOSE,
 
-
     /**
      * \par Overview
      *
@@ -572,6 +582,8 @@
  * - BlockLoad can be optionally specialized by different data movement strategies:
  *   -# cub::BLOCK_LOAD_DIRECT.  A [blocked arrangement](index.html#sec5sec3)
  *      of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_STRIPED.  A [striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
  *   -# cub::BLOCK_LOAD_VECTORIZE.  A [blocked arrangement](index.html#sec5sec3)
  *      of data is read directly from memory using CUDA's built-in vectorized loads as a
  *      coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
@@ -703,6 +715,59 @@ private:
     };
 
 
+    /**
+     * BLOCK_LOAD_STRIPED specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+            int             valid_items)                ///< [in] Number of valid items to load
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+            int             valid_items,                ///< [in] Number of valid items to load
+            DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
     /**
      * BLOCK_LOAD_VECTORIZE specialization of load helper
     */
diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh
index 155c3b4217..cb00ec7287 100644
--- a/cub/block/block_store.cuh
+++ b/cub/block/block_store.cuh
@@ -364,6 +364,17 @@ enum BlockStoreAlgorithm
      */
     BLOCK_STORE_DIRECT,
 
+    /**
+     * \par Overview
+     * A [striped arrangement](index.html#sec5sec3) of data is written
+     * directly to memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number of items per thread).
+     */
+    BLOCK_STORE_STRIPED,
+
     /**
      * \par Overview
      *
@@ -432,7 +443,6 @@ enum BlockStoreAlgorithm
      *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
      */
     BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-
 };
 
 
@@ -456,6 +466,8 @@ enum BlockStoreAlgorithm
  * - BlockStore can be optionally specialized by different data movement strategies:
  *   -# cub::BLOCK_STORE_DIRECT.  A [blocked arrangement](index.html#sec5sec3) of data is written
  *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# cub::BLOCK_STORE_STRIPED.  A [striped arrangement](index.html#sec5sec3)
+ *      of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
  *   -# cub::BLOCK_STORE_VECTORIZE.  A [blocked arrangement](index.html#sec5sec3)
  *      of data is written directly to memory using CUDA's built-in vectorized stores as a
  *      coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
@@ -579,6 +591,47 @@ private:
     };
 
 
+    /**
+     * BLOCK_STORE_STRIPED specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int             valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
     /**
      * BLOCK_STORE_VECTORIZE specialization of store helper
      */
@@ -822,6 +875,7 @@ private:
             StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
         }
     };
+
 
     /******************************************************************************
      * Type definitions
diff --git a/test/test_block_load_store.cu b/test/test_block_load_store.cu
index 10e4b2dff1..ec1c2bd02c 100644
--- a/test/test_block_load_store.cu
+++ b/test/test_block_load_store.cu
@@ -439,6 +439,7 @@ void TestStrategy(
     TestPointerType(grid_size, fraction_valid);
     TestPointerType(grid_size, fraction_valid);
     TestPointerType(grid_size, fraction_valid);
+    TestPointerType(grid_size, fraction_valid);
 }
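
Usage note (not part of the patch): the sketch below shows how a kernel might specialize cub::BlockLoad and cub::BlockStore on the new BLOCK_LOAD_STRIPED / BLOCK_STORE_STRIPED enumerators. The kernel name, the d_in/d_out parameters, and the tile-offset arithmetic are illustrative assumptions; the template parameters and the Load/Store calls follow the existing BlockLoad/BlockStore interfaces in block_load.cuh and block_store.cuh.

```cuda
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

// Hypothetical kernel: each thread block copies one tile of
// BLOCK_THREADS * ITEMS_PER_THREAD ints using the new striped algorithms.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void StripedCopyKernel(const int *d_in, int *d_out)
{
    // Specialize BlockLoad/BlockStore on the new enumerators
    typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD,
                           cub::BLOCK_LOAD_STRIPED>   BlockLoadT;
    typedef cub::BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD,
                            cub::BLOCK_STORE_STRIPED> BlockStoreT;

    // The striped specializations declare TempStorage as NullType (no shared
    // memory is used), but TempStorage remains part of the collective interface.
    __shared__ typename BlockLoadT::TempStorage  load_storage;
    __shared__ typename BlockStoreT::TempStorage store_storage;

    // Base offset of this thread block's tile
    int block_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;

    // After Load(), items are in a striped arrangement:
    // items[i] of thread t holds element block_offset + i * BLOCK_THREADS + t.
    int items[ITEMS_PER_THREAD];
    BlockLoadT(load_storage).Load(d_in + block_offset, items);

    // ... rank-insensitive, per-item work on items[] would go here ...

    BlockStoreT(store_storage).Store(d_out + block_offset, items);
}
```

Because the data lands in a striped rather than blocked arrangement, this layout suits per-item work that does not depend on blocked ordering; cub::BlockExchange can convert between the two arrangements when a blocked layout is required.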