
Merge pull request #274 from mnicely/main
Add BLOCK_LOAD_STRIPED and BLOCK_STORE_STRIPED
alliepiper authored Jun 11, 2021
2 parents a693b01 + f9fbcbe commit d056a9a
Showing 3 changed files with 123 additions and 3 deletions.
69 changes: 67 additions & 2 deletions cub/block/block_load.cuh
@@ -472,6 +472,18 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_DIRECT,

/**
* \par Overview
*
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
* directly from memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) doesn't depend on
* the number of items per thread; consecutive threads read consecutive elements.
*/
BLOCK_LOAD_STRIPED,

/**
* \par Overview
*
@@ -507,7 +519,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_TRANSPOSE,


/**
* \par Overview
*
@@ -528,7 +539,6 @@ enum BlockLoadAlgorithm
*/
BLOCK_LOAD_WARP_TRANSPOSE,


/**
* \par Overview
*
@@ -572,6 +582,8 @@ enum BlockLoadAlgorithm
* - BlockLoad can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
* -# <b>cub::BLOCK_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is read directly from memory using CUDA's built-in vectorized loads as a
* coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
@@ -703,6 +715,59 @@ private:
};


/**
* BLOCK_LOAD_STRIPED specialization of load helper
*/
template <int DUMMY>
struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
{
/// Shared memory storage layout type
typedef NullType TempStorage;

/// Linear thread-id
int linear_tid;

/// Constructor
__device__ __forceinline__ LoadInternal(
TempStorage &/*temp_storage*/,
int linear_tid)
:
linear_tid(linear_tid)
{}

/// Load a linear segment of items from memory
template <typename InputIteratorT>
__device__ __forceinline__ void Load(
InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
}

/// Load a linear segment of items from memory, guarded by range
template <typename InputIteratorT>
__device__ __forceinline__ void Load(
InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int valid_items) ///< [in] Number of valid items to load
{
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}

/// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
template <typename InputIteratorT, typename DefaultT>
__device__ __forceinline__ void Load(
InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from
InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int valid_items, ///< [in] Number of valid items to load
DefaultT oob_default) ///< [in] Default value to assign out-of-bound items
{
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
}

};


/**
* BLOCK_LOAD_VECTORIZE specialization of load helper
*/
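A minimal sketch of how the new load strategy can be used, assuming a 128-thread block and 4 items per thread; the kernel name and the -1 out-of-bounds default are illustrative, not part of this commit:

#include <cub/block/block_load.cuh>

__global__ void LoadStripedSketch(const int *d_in, int num_valid)
{
    // Specialize BlockLoad: int data, 128 threads, 4 items per thread, striped strategy.
    using BlockLoadT = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_STRIPED>;

    // TempStorage is NullType for this specialization, but declaring it keeps
    // the kernel valid under any BlockLoadAlgorithm.
    __shared__ typename BlockLoadT::TempStorage temp_storage;

    int items[4];

    // Guarded load: thread t receives elements t, t+128, t+256, t+384 of the
    // block's segment; slots at or beyond num_valid are filled with -1.
    BlockLoadT(temp_storage).Load(d_in, items, num_valid, -1);
}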
56 changes: 55 additions & 1 deletion cub/block/block_store.cuh
@@ -364,6 +364,17 @@ enum BlockStoreAlgorithm
*/
BLOCK_STORE_DIRECT,

/**
* \par Overview
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) doesn't depend on
* the number of items per thread; consecutive threads write consecutive elements.
*/
BLOCK_STORE_STRIPED,

/**
* \par Overview
*
@@ -432,7 +443,6 @@ enum BlockStoreAlgorithm
* latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
*/
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,

};


@@ -456,6 +466,8 @@ enum BlockStoreAlgorithm
* - BlockStore can be optionally specialized by different data movement strategies:
* -# <b>cub::BLOCK_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory using CUDA's built-in vectorized stores as a
* coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
@@ -579,6 +591,47 @@ private:
};


/**
* BLOCK_STORE_STRIPED specialization of store helper
*/
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
{
/// Shared memory storage layout type
typedef NullType TempStorage;

/// Linear thread-id
int linear_tid;

/// Constructor
__device__ __forceinline__ StoreInternal(
TempStorage &/*temp_storage*/,
int linear_tid)
:
linear_tid(linear_tid)
{}

/// Store items into a linear segment of memory
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
}

/// Store items into a linear segment of memory, guarded by range
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}
};


/**
* BLOCK_STORE_VECTORIZE specialization of store helper
*/
@@ -822,6 +875,7 @@ private:
StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
}
};


/******************************************************************************
* Type definitions
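A matching sketch for the store side, under the same illustrative geometry (128 threads, 4 items per thread); the kernel name is hypothetical:

#include <cub/block/block_store.cuh>

__global__ void StoreStripedSketch(int *d_out, int num_valid)
{
    // Specialize BlockStore: int data, 128 threads, 4 items per thread, striped strategy.
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_STRIPED>;
    __shared__ typename BlockStoreT::TempStorage temp_storage;

    // Each value records the striped offset it will be written to.
    int items[4];
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        items[i] = threadIdx.x + i * 128;

    // Guarded store: only the first num_valid elements of the block's segment
    // are written (note the diff adds no out-of-bounds-default overload for stores).
    BlockStoreT(temp_storage).Store(d_out, items, num_valid);
}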
1 change: 1 addition & 0 deletions test/test_block_load_store.cu
@@ -439,6 +439,7 @@ void TestStrategy(
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_STRIPED, BLOCK_STORE_STRIPED>(grid_size, fraction_valid);
}


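A sketch of the round trip the new test line exercises, with template parameters assumed to mirror the test's BLOCK_THREADS and ITEMS_PER_THREAD; because load and store use the same striped mapping, the copy preserves element order without a shared-memory exchange on either side:

#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void StripedRoundTrip(const int *d_in, int *d_out, int num_valid)
{
    using BlockLoadT  = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_STRIPED>;
    using BlockStoreT = cub::BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_STRIPED>;

    // Standard CUB pattern: alias the temp storage of both primitives
    // (both are empty NullType for the striped specializations).
    __shared__ union {
        typename BlockLoadT::TempStorage  load;
        typename BlockStoreT::TempStorage store;
    } temp_storage;

    int items[ITEMS_PER_THREAD];
    BlockLoadT(temp_storage.load).Load(d_in, items, num_valid, 0);   // oob slots default to 0
    BlockStoreT(temp_storage.store).Store(d_out, items, num_valid);  // write back only valid items
}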
