Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add generic reduction functions and separate reductions/warp_primitives #1470

Merged
merged 31 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
dbd348e
add reduction op type, ranked reduction and weighted reduction
akifcorduk Apr 27, 2023
e911e66
add binary reduction and pow2
akifcorduk Apr 27, 2023
4daafc1
clang format
akifcorduk Apr 27, 2023
fef36e3
add tests for binary reduction and separate warp primitives
akifcorduk Apr 27, 2023
d856c3b
remove next_pow2 function
akifcorduk Apr 27, 2023
b4fe470
add device span include
akifcorduk Apr 28, 2023
0a11a78
Merge branch 'branch-23.06' into improve_reduction
akifcorduk Apr 28, 2023
a2e5608
fix style include order
akifcorduk Apr 28, 2023
91dbcc2
add default template type
akifcorduk Apr 28, 2023
1702361
add correct function comments
akifcorduk May 2, 2023
1384077
Merge branch 'branch-23.06' into improve_reduction
akifcorduk May 2, 2023
f6758d3
Merge branch 'branch-23.06' into improve_reduction
akifcorduk May 2, 2023
4c10dda
Merge github.com:rapidsai/raft into improve_reduction
akifcorduk May 3, 2023
85d0db1
add include comments and paranthesis to if block
akifcorduk May 3, 2023
e30562b
Merge branch 'improve_reduction' of github.com:akifcorduk/raft into i…
akifcorduk May 3, 2023
6b89723
Merge branch 'branch-23.06' into improve_reduction
cjnolet May 4, 2023
94138a6
use DI macro, add max_warps_per_block var, remove stale comment
akifcorduk May 4, 2023
f1daedc
use better comments and naming for weightedSelect and add static_asserts
akifcorduk May 4, 2023
808e64f
Merge branch 'improve_reduction' of github.com:akifcorduk/raft into i…
akifcorduk May 4, 2023
986810f
adjust test names
akifcorduk May 4, 2023
56e1deb
fix compile error
akifcorduk May 4, 2023
3fec235
Add detailed comment on blockWeightedSelect
akifcorduk May 4, 2023
c154182
add todo comment and remove command in the comment
akifcorduk May 5, 2023
8f48546
Merge branch 'branch-23.06' into improve_reduction
cjnolet May 15, 2023
133b4dd
Merge branch 'branch-23.06' into improve_reduction
tfeher May 16, 2023
7d39c35
move block and warp level random sampling to random::device namespace
tfeher May 16, 2023
f72aae8
move random/device/sample_device.cuh to random/sample_devic.cuh
tfeher May 16, 2023
4d56e47
Revert "move random/device/sample_device.cuh to random/sample_devic.cuh"
tfeher May 17, 2023
b2c4ce0
Merge branch 'branch-23.06' into improve_reduction
tfeher May 17, 2023
b355af3
Merge branch 'branch-23.06' into improve_reduction
tfeher May 17, 2023
c879e48
Fix style
tfeher May 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
312 changes: 3 additions & 309 deletions cpp/include/raft/util/cuda_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
#include <raft/core/cudart_utils.hpp>
#include <raft/core/math.hpp>
#include <raft/core/operators.hpp>
// For backward compatibility, we include the follow headers. They contain
// functionality that were previously contained in cuda_utils.cuh
#include <raft/util/cuda_dev_essentials.cuh>
#include <raft/util/reduction.cuh>
tfeher marked this conversation as resolved.
Show resolved Hide resolved

namespace raft {

Expand Down Expand Up @@ -523,238 +526,6 @@ DI double maxPrim(double x, double y)
}
/** @} */

/** apply a warp-wide fence (useful from Volta+ archs) */
DI void warpFence()
{
#if __CUDA_ARCH__ >= 700
__syncwarp();
#endif
}

/** warp-wide any boolean aggregator */
DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
inFlag = __any_sync(mask, inFlag);
#else
inFlag = __any(inFlag);
#endif
return inFlag;
}

/** warp-wide all boolean aggregator */
DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
inFlag = __all_sync(mask, inFlag);
#else
inFlag = __all(inFlag);
#endif
return inFlag;
}

/** For every thread in the warp, set the corresponding bit to the thread's flag value. */
DI uint32_t ballot(bool inFlag, uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
return __ballot_sync(mask, inFlag);
#else
return __ballot(inFlag);
#endif
}

/** True CUDA alignment of a type (adapted from CUB) */
template <typename T>
struct cuda_alignment {
struct Pad {
T val;
char byte;
};

static constexpr int bytes = sizeof(Pad) - sizeof(T);
};

template <typename LargeT, typename UnitT>
struct is_multiple {
static constexpr int large_align_bytes = cuda_alignment<LargeT>::bytes;
static constexpr int unit_align_bytes = cuda_alignment<UnitT>::bytes;
static constexpr bool value =
(sizeof(LargeT) % sizeof(UnitT) == 0) && (large_align_bytes % unit_align_bytes == 0);
};

template <typename LargeT, typename UnitT>
inline constexpr bool is_multiple_v = is_multiple<LargeT, UnitT>::value;

template <typename T>
struct is_shuffleable {
static constexpr bool value =
std::is_same_v<T, int> || std::is_same_v<T, unsigned int> || std::is_same_v<T, long> ||
std::is_same_v<T, unsigned long> || std::is_same_v<T, long long> ||
std::is_same_v<T, unsigned long long> || std::is_same_v<T, float> || std::is_same_v<T, double>;
};

template <typename T>
inline constexpr bool is_shuffleable_v = is_shuffleable<T>::value;

/**
* @brief Shuffle the data inside a warp
* @tparam T the data type
* @param val value to be shuffled
* @param srcLane lane from where to shuffle
* @param width lane width
* @param mask mask of participating threads (Volta+)
* @return the shuffled data
*/
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl(T val,
int srcLane,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
return __shfl_sync(mask, val, srcLane, width);
#else
return __shfl(val, srcLane, width);
#endif
}

/// Overload of shfl for data types not supported by the CUDA intrinsics
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl(T val,
int srcLane,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
using UnitT =
std::conditional_t<is_multiple_v<T, int>,
unsigned int,
std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

constexpr int n_words = sizeof(T) / sizeof(UnitT);

T output;
UnitT* output_alias = reinterpret_cast<UnitT*>(&output);
UnitT* input_alias = reinterpret_cast<UnitT*>(&val);

unsigned int shuffle_word;
shuffle_word = shfl((unsigned int)input_alias[0], srcLane, width, mask);
output_alias[0] = shuffle_word;

#pragma unroll
for (int i = 1; i < n_words; ++i) {
shuffle_word = shfl((unsigned int)input_alias[i], srcLane, width, mask);
output_alias[i] = shuffle_word;
}

return output;
}

/**
* @brief Shuffle the data inside a warp from lower lane IDs
* @tparam T the data type
* @param val value to be shuffled
* @param delta lower lane ID delta from where to shuffle
* @param width lane width
* @param mask mask of participating threads (Volta+)
* @return the shuffled data
*/
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl_up(T val,
int delta,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
return __shfl_up_sync(mask, val, delta, width);
#else
return __shfl_up(val, delta, width);
#endif
}

/// Overload of shfl_up for data types not supported by the CUDA intrinsics
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_up(T val,
int delta,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
using UnitT =
std::conditional_t<is_multiple_v<T, int>,
unsigned int,
std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

constexpr int n_words = sizeof(T) / sizeof(UnitT);

T output;
UnitT* output_alias = reinterpret_cast<UnitT*>(&output);
UnitT* input_alias = reinterpret_cast<UnitT*>(&val);

unsigned int shuffle_word;
shuffle_word = shfl_up((unsigned int)input_alias[0], delta, width, mask);
output_alias[0] = shuffle_word;

#pragma unroll
for (int i = 1; i < n_words; ++i) {
shuffle_word = shfl_up((unsigned int)input_alias[i], delta, width, mask);
output_alias[i] = shuffle_word;
}

return output;
}

/**
* @brief Shuffle the data inside a warp
* @tparam T the data type
* @param val value to be shuffled
* @param laneMask mask to be applied in order to perform xor shuffle
* @param width lane width
* @param mask mask of participating threads (Volta+)
* @return the shuffled data
*/
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl_xor(T val,
int laneMask,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
return __shfl_xor_sync(mask, val, laneMask, width);
#else
return __shfl_xor(val, laneMask, width);
#endif
}

/// Overload of shfl_xor for data types not supported by the CUDA intrinsics
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_xor(T val,
int laneMask,
int width = WarpSize,
uint32_t mask = 0xffffffffu)
{
using UnitT =
std::conditional_t<is_multiple_v<T, int>,
unsigned int,
std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

constexpr int n_words = sizeof(T) / sizeof(UnitT);

T output;
UnitT* output_alias = reinterpret_cast<UnitT*>(&output);
UnitT* input_alias = reinterpret_cast<UnitT*>(&val);

unsigned int shuffle_word;
shuffle_word = shfl_xor((unsigned int)input_alias[0], laneMask, width, mask);
output_alias[0] = shuffle_word;

#pragma unroll
for (int i = 1; i < n_words; ++i) {
shuffle_word = shfl_xor((unsigned int)input_alias[i], laneMask, width, mask);
output_alias[i] = shuffle_word;
}

return output;
}

/**
* @brief Four-way byte dot product-accumulate.
* @tparam T Four-byte integer: int or unsigned int
Expand Down Expand Up @@ -816,83 +587,6 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
#endif
}

/**
* @brief Logical-warp-level reduction
* @tparam logicalWarpSize Logical warp size (2, 4, 8, 16 or 32)
* @tparam T Value type to be reduced
* @tparam ReduceLambda Reduction operation type
* @param val input value
* @param reduce_op Reduction operation
* @return Reduction result. All lanes will have the valid result.
*/
template <int logicalWarpSize, typename T, typename ReduceLambda>
DI T logicalWarpReduce(T val, ReduceLambda reduce_op)
{
#pragma unroll
for (int i = logicalWarpSize / 2; i > 0; i >>= 1) {
T tmp = shfl_xor(val, i);
val = reduce_op(val, tmp);
}
return val;
}

/**
* @brief Warp-level reduction
* @tparam T Value type to be reduced
* @tparam ReduceLambda Reduction operation type
* @param val input value
* @param reduce_op Reduction operation
* @return Reduction result. All lanes will have the valid result.
* @note Why not cub? Because cub doesn't seem to allow working with arbitrary
* number of warps in a block. All threads in the warp must enter this
* function together
*/
template <typename T, typename ReduceLambda>
DI T warpReduce(T val, ReduceLambda reduce_op)
{
return logicalWarpReduce<WarpSize>(val, reduce_op);
}

/**
* @brief Warp-level sum reduction
* @tparam T Value type to be reduced
* @param val input value
* @return Reduction result. All lanes will have the valid result.
* @note Why not cub? Because cub doesn't seem to allow working with arbitrary
* number of warps in a block. All threads in the warp must enter this
* function together
*/
template <typename T>
DI T warpReduce(T val)
{
return warpReduce(val, raft::add_op{});
}

/**
* @brief 1-D block-level sum reduction
* @param val input value
* @param smem shared memory region needed for storing intermediate results. It
* must alteast be of size: `sizeof(T) * nWarps`
* @return only the thread0 will contain valid reduced result
* @note Why not cub? Because cub doesn't seem to allow working with arbitrary
* number of warps in a block. All threads in the block must enter this
* function together
* @todo Expand this to support arbitrary reduction ops
*/
template <typename T>
DI T blockReduce(T val, char* smem)
{
auto* sTemp = reinterpret_cast<T*>(smem);
int nWarps = (blockDim.x + WarpSize - 1) / WarpSize;
int lid = laneId();
int wid = threadIdx.x / WarpSize;
val = warpReduce(val);
if (lid == 0) sTemp[wid] = val;
__syncthreads();
val = lid < nWarps ? sTemp[lid] : T(0);
return warpReduce(val);
}

/**
* @brief Simple utility function to determine whether user_stream or one of the
* internal streams should be used.
Expand Down
Loading