Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Add BLOCK_LOAD_STRIPED and BLOCK_STORE_STRIPED #274

Merged
merged 8 commits into from
Jun 11, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,18 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_DIRECT,

/**
* \par Overview
*
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
* directly from memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_LOAD_STRIPED,

/**
* \par Overview
*
Expand Down Expand Up @@ -507,7 +519,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_TRANSPOSE,


/**
* \par Overview
*
Expand All @@ -528,7 +539,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_WARP_TRANSPOSE,


/**
* \par Overview
*
Expand Down Expand Up @@ -572,6 +582,8 @@ enum BlockLoadAlgorithm
* - BlockLoad can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory using CUDA's built-in vectorized loads as a
* coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
Expand Down Expand Up @@ -703,6 +715,59 @@ private:
};


/**
 * BLOCK_LOAD_STRIPED specialization of load helper.  Delegates directly to
 * LoadDirectStriped<BLOCK_THREADS>, so items are read in a striped
 * arrangement with no shared-memory exchange.
 */
template <int DUMMY>
struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
{
    /// Shared memory storage layout type (NullType: striped loads need no shared memory)
    typedef NullType TempStorage;

    /// Linear thread-id
    int linear_tid;

    /// Constructor (temp_storage is accepted for interface uniformity but unused)
    __device__ __forceinline__ LoadInternal(
        TempStorage &temp_storage,
        int linear_tid)
    :
        linear_tid(linear_tid)
    {}

    /// Load a linear segment of items from memory
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD])      ///< [out] Data to load
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
    }

    /// Load a linear segment of items from memory, guarded by range
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD],      ///< [out] Data to load
        int valid_items)                        ///< [in] Number of valid items to load
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
    }

    /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT block_itr,               ///< [in] The thread block's base input iterator for loading from
        InputT (&items)[ITEMS_PER_THREAD],      ///< [out] Data to load
        int valid_items,                        ///< [in] Number of valid items to load
        DefaultT oob_default)                   ///< [in] Default value to assign out-of-bound items
    {
        LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
    }

};


/**
* BLOCK_LOAD_VECTORIZE specialization of load helper
*/
Expand Down
59 changes: 58 additions & 1 deletion cub/block/block_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,17 @@ enum BlockStoreAlgorithm
*/
BLOCK_STORE_DIRECT,

/**
* \par Overview
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_STORE_STRIPED,

/**
* \par Overview
*
Expand Down Expand Up @@ -432,7 +443,6 @@ enum BlockStoreAlgorithm
* latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
*/
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,

};


Expand All @@ -456,6 +466,8 @@ enum BlockStoreAlgorithm
* - BlockStore can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory using CUDA's built-in vectorized stores as a
* coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
Expand Down Expand Up @@ -579,6 +591,50 @@ private:
};


/**
 * BLOCK_STORE_STRIPED specialization of store helper.  Delegates directly to
 * StoreDirectStriped<BLOCK_THREADS>, so items are written in a striped
 * arrangement with no shared-memory exchange.
 */
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
{
    /// Shared memory storage layout type (NullType: striped stores need no shared memory)
    typedef NullType TempStorage;

    /// Linear thread-id
    int linear_tid;

    /// Constructor (temp_storage is accepted for interface uniformity but unused;
    /// TempStorage is NullType here, so there is no temp_storage member to cache)
    __device__ __forceinline__ StoreInternal(
        TempStorage &temp_storage,
        int linear_tid)
    :
        linear_tid(linear_tid)
    {}

    /// Store items into a linear segment of memory
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,              ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD])           ///< [in] Data to store
    {
        StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
    }

    /// Store items into a linear segment of memory, guarded by range
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT block_itr,              ///< [in] The thread block's base output iterator for storing to
        T (&items)[ITEMS_PER_THREAD],           ///< [in] Data to store
        int valid_items)                        ///< [in] Number of valid items to write
    {
        // Unlike the TRANSPOSE specializations, there is no shared memory here
        // to stage valid_items through, so pass it straight to the direct store.
        StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
    }
};


/**
* BLOCK_STORE_VECTORIZE specialization of store helper
*/
Expand Down Expand Up @@ -822,6 +878,7 @@ private:
StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
}
};


/******************************************************************************
* Type definitions
Expand Down
1 change: 1 addition & 0 deletions test/test_block_load_store.cu
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ void TestStrategy(
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_STRIPED, BLOCK_STORE_STRIPED>(grid_size, fraction_valid);
}


Expand Down