Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add generic reduction functions and separate reductions/warp_primitives #1470

Merged
merged 31 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
dbd348e
add reduction op type, ranked reduction and weighted reduction
akifcorduk Apr 27, 2023
e911e66
add binary reduction and pow2
akifcorduk Apr 27, 2023
4daafc1
clang format
akifcorduk Apr 27, 2023
fef36e3
add tests for binary reduction and separate warp primitives
akifcorduk Apr 27, 2023
d856c3b
remove next_pow2 function
akifcorduk Apr 27, 2023
b4fe470
add device span include
akifcorduk Apr 28, 2023
0a11a78
Merge branch 'branch-23.06' into improve_reduction
akifcorduk Apr 28, 2023
a2e5608
fix style include order
akifcorduk Apr 28, 2023
91dbcc2
add default template type
akifcorduk Apr 28, 2023
1702361
add correct function comments
akifcorduk May 2, 2023
1384077
Merge branch 'branch-23.06' into improve_reduction
akifcorduk May 2, 2023
f6758d3
Merge branch 'branch-23.06' into improve_reduction
akifcorduk May 2, 2023
4c10dda
Merge github.com:rapidsai/raft into improve_reduction
akifcorduk May 3, 2023
85d0db1
add include comments and paranthesis to if block
akifcorduk May 3, 2023
e30562b
Merge branch 'improve_reduction' of github.com:akifcorduk/raft into i…
akifcorduk May 3, 2023
6b89723
Merge branch 'branch-23.06' into improve_reduction
cjnolet May 4, 2023
94138a6
use DI macro, add max_warps_per_block var, remove stale comment
akifcorduk May 4, 2023
f1daedc
use better comments and naming for weightedSelect and add static_asserts
akifcorduk May 4, 2023
808e64f
Merge branch 'improve_reduction' of github.com:akifcorduk/raft into i…
akifcorduk May 4, 2023
986810f
adjust test names
akifcorduk May 4, 2023
56e1deb
fix compile error
akifcorduk May 4, 2023
3fec235
Add detailed comment on blockWeightedSelect
akifcorduk May 4, 2023
c154182
add todo comment and remove command in the comment
akifcorduk May 5, 2023
8f48546
Merge branch 'branch-23.06' into improve_reduction
cjnolet May 15, 2023
133b4dd
Merge branch 'branch-23.06' into improve_reduction
tfeher May 16, 2023
7d39c35
move block and warp level random sampling to random::device namespace
tfeher May 16, 2023
f72aae8
move random/device/sample_device.cuh to random/sample_devic.cuh
tfeher May 16, 2023
4d56e47
Revert "move random/device/sample_device.cuh to random/sample_devic.cuh"
tfeher May 17, 2023
b2c4ce0
Merge branch 'branch-23.06' into improve_reduction
tfeher May 17, 2023
b355af3
Merge branch 'branch-23.06' into improve_reduction
tfeher May 17, 2023
c879e48
Fix style
tfeher May 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
312 changes: 3 additions & 309 deletions cpp/include/raft/util/cuda_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
#include <raft/core/cudart_utils.hpp>
#include <raft/core/math.hpp>
#include <raft/core/operators.hpp>
// For backward compatibility, we include the following headers. They contain
// functionality that was previously contained in cuda_utils.cuh
#include <raft/util/cuda_dev_essentials.cuh>
#include <raft/util/reduction.cuh>
tfeher marked this conversation as resolved.
Show resolved Hide resolved

namespace raft {

Expand Down Expand Up @@ -523,238 +526,6 @@ DI double maxPrim(double x, double y)
}
/** @} */

/**
 * Apply a warp-wide fence.
 *
 * On compute capability 7.0+ (Volta and newer) threads of a warp can diverge
 * and execute independently, so an explicit __syncwarp() is needed to
 * reconverge and synchronize the warp. The guard compiles the fence away on
 * older architectures, where it is unnecessary.
 */
DI void warpFence()
{
#if __CUDA_ARCH__ >= 700
  __syncwarp();
#endif
}

/**
 * Warp-wide "any" vote: true on every participating lane if at least one
 * lane passed a true flag.
 *
 * @param inFlag this lane's boolean contribution
 * @param mask   bitmask of participating lanes (Volta+); defaults to the full warp
 * @return the aggregated result, identical on all participating lanes
 */
DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
  return __any_sync(mask, inFlag);
#else
  return __any(inFlag);
#endif
}

/**
 * Warp-wide "all" vote: true on every participating lane only if every lane
 * passed a true flag.
 *
 * @param inFlag this lane's boolean contribution
 * @param mask   bitmask of participating lanes (Volta+); defaults to the full warp
 * @return the aggregated result, identical on all participating lanes
 */
DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
  return __all_sync(mask, inFlag);
#else
  return __all(inFlag);
#endif
}

/**
 * Warp-wide ballot: collect every lane's flag into one 32-bit word, with bit i
 * set to lane i's flag value.
 *
 * @param inFlag this lane's boolean contribution
 * @param mask   bitmask of participating lanes (Volta+); defaults to the full warp
 * @return the ballot bitmask, identical on all participating lanes
 */
DI uint32_t ballot(bool inFlag, uint32_t mask = 0xffffffffu)
{
  uint32_t lane_bits;
#if CUDART_VERSION >= 9000
  lane_bits = __ballot_sync(mask, inFlag);
#else
  lane_bits = __ballot(inFlag);
#endif
  return lane_bits;
}

/**
 * True CUDA alignment of a type (adapted from CUB).
 *
 * Computes the alignment requirement of T via the classic padding trick:
 * placing a single char after a T inside a struct forces the compiler to pad
 * the struct up to the next multiple of T's alignment, so the growth
 * `sizeof(Pad) - sizeof(T)` equals that alignment in bytes.
 */
template <typename T>
struct cuda_alignment {
  // Helper whose size reveals T's alignment through the compiler's padding.
  struct Pad {
    T val;
    char byte;
  };

  // Alignment of T in bytes.
  static constexpr int bytes = sizeof(Pad) - sizeof(T);
};

/**
 * Trait: true when LargeT can be exactly tiled by UnitT, i.e. LargeT's size is
 * a whole multiple of UnitT's size AND LargeT's alignment is a whole multiple
 * of UnitT's alignment. Used to pick a safe word size for word-wise shuffles.
 */
template <typename LargeT, typename UnitT>
struct is_multiple {
  static constexpr int large_align_bytes = cuda_alignment<LargeT>::bytes;
  static constexpr int unit_align_bytes = cuda_alignment<UnitT>::bytes;
  static constexpr bool value =
    (sizeof(LargeT) % sizeof(UnitT) == 0) && (large_align_bytes % unit_align_bytes == 0);
};

// Convenience variable template for is_multiple<...>::value.
template <typename LargeT, typename UnitT>
inline constexpr bool is_multiple_v = is_multiple<LargeT, UnitT>::value;

/**
 * Trait: true for the fundamental integer and floating-point types that the
 * CUDA warp shuffle intrinsics accept directly. Any other type must go
 * through the word-wise shfl/shfl_up/shfl_xor overloads below.
 */
template <typename T>
struct is_shuffleable {
  static constexpr bool value =
    std::is_same_v<T, int> || std::is_same_v<T, unsigned int> || std::is_same_v<T, long> ||
    std::is_same_v<T, unsigned long> || std::is_same_v<T, long long> ||
    std::is_same_v<T, unsigned long long> || std::is_same_v<T, float> || std::is_same_v<T, double>;
};

// Convenience variable template for is_shuffleable<...>::value.
template <typename T>
inline constexpr bool is_shuffleable_v = is_shuffleable<T>::value;

/**
 * @brief Shuffle the data inside a warp: read `val` from lane `srcLane`.
 * @tparam T the data type (must be natively supported by the shuffle intrinsics)
 * @param val value to be shuffled
 * @param srcLane lane from where to shuffle
 * @param width lane width (logical warp size)
 * @param mask mask of participating threads (Volta+)
 * @return the shuffled data
 * @note dispatches to __shfl_sync on CUDA 9.0+ and to the legacy __shfl
 *       intrinsic otherwise
 */
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl(T val,
                                                 int srcLane,
                                                 int width = WarpSize,
                                                 uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
  return __shfl_sync(mask, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}

/**
 * Overload of shfl for data types not supported by the CUDA intrinsics.
 *
 * Reinterprets the value as an array of unsigned words and shuffles it one
 * word at a time through the intrinsic-backed shfl overload.
 */
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl(T val,
                                                  int srcLane,
                                                  int width = WarpSize,
                                                  uint32_t mask = 0xffffffffu)
{
  // Widest unsigned unit that exactly tiles T (size and alignment both divide).
  using UnitT =
    std::conditional_t<is_multiple_v<T, int>,
                       unsigned int,
                       std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

  constexpr int kWordCount = sizeof(T) / sizeof(UnitT);

  T result;
  UnitT* dst_words = reinterpret_cast<UnitT*>(&result);
  UnitT* src_words = reinterpret_cast<UnitT*>(&val);

#pragma unroll
  for (int w = 0; w < kWordCount; ++w) {
    // Widen to unsigned int for the intrinsic, then narrow back into place.
    unsigned int moved = shfl(static_cast<unsigned int>(src_words[w]), srcLane, width, mask);
    dst_words[w]       = static_cast<UnitT>(moved);
  }

  return result;
}

/**
 * @brief Shuffle the data inside a warp from lower lane IDs: each lane reads
 *        `val` from the lane `delta` positions below it.
 * @tparam T the data type (must be natively supported by the shuffle intrinsics)
 * @param val value to be shuffled
 * @param delta lower lane ID delta from where to shuffle
 * @param width lane width (logical warp size)
 * @param mask mask of participating threads (Volta+)
 * @return the shuffled data
 * @note dispatches to __shfl_up_sync on CUDA 9.0+ and to the legacy
 *       __shfl_up intrinsic otherwise
 */
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl_up(T val,
                                                    int delta,
                                                    int width = WarpSize,
                                                    uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
  return __shfl_up_sync(mask, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}

/**
 * Overload of shfl_up for data types not supported by the CUDA intrinsics.
 *
 * Reinterprets the value as an array of unsigned words and shuffles it one
 * word at a time through the intrinsic-backed shfl_up overload.
 */
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_up(T val,
                                                     int delta,
                                                     int width = WarpSize,
                                                     uint32_t mask = 0xffffffffu)
{
  // Widest unsigned unit that exactly tiles T (size and alignment both divide).
  using UnitT =
    std::conditional_t<is_multiple_v<T, int>,
                       unsigned int,
                       std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

  constexpr int kWordCount = sizeof(T) / sizeof(UnitT);

  T result;
  UnitT* dst_words = reinterpret_cast<UnitT*>(&result);
  UnitT* src_words = reinterpret_cast<UnitT*>(&val);

#pragma unroll
  for (int w = 0; w < kWordCount; ++w) {
    // Widen to unsigned int for the intrinsic, then narrow back into place.
    unsigned int moved = shfl_up(static_cast<unsigned int>(src_words[w]), delta, width, mask);
    dst_words[w]       = static_cast<UnitT>(moved);
  }

  return result;
}

/**
 * @brief Butterfly-shuffle the data inside a warp: each lane reads `val` from
 *        the lane whose ID is its own XORed with `laneMask`.
 * @tparam T the data type (must be natively supported by the shuffle intrinsics)
 * @param val value to be shuffled
 * @param laneMask mask to be applied in order to perform xor shuffle
 * @param width lane width (logical warp size)
 * @param mask mask of participating threads (Volta+)
 * @return the shuffled data
 * @note dispatches to __shfl_xor_sync on CUDA 9.0+ and to the legacy
 *       __shfl_xor intrinsic otherwise
 */
template <typename T>
DI std::enable_if_t<is_shuffleable_v<T>, T> shfl_xor(T val,
                                                     int laneMask,
                                                     int width = WarpSize,
                                                     uint32_t mask = 0xffffffffu)
{
#if CUDART_VERSION >= 9000
  return __shfl_xor_sync(mask, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}

/**
 * Overload of shfl_xor for data types not supported by the CUDA intrinsics.
 *
 * Reinterprets the value as an array of unsigned words and shuffles it one
 * word at a time through the intrinsic-backed shfl_xor overload.
 */
template <typename T>
DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_xor(T val,
                                                      int laneMask,
                                                      int width = WarpSize,
                                                      uint32_t mask = 0xffffffffu)
{
  // Widest unsigned unit that exactly tiles T (size and alignment both divide).
  using UnitT =
    std::conditional_t<is_multiple_v<T, int>,
                       unsigned int,
                       std::conditional_t<is_multiple_v<T, short>, unsigned short, unsigned char>>;

  constexpr int kWordCount = sizeof(T) / sizeof(UnitT);

  T result;
  UnitT* dst_words = reinterpret_cast<UnitT*>(&result);
  UnitT* src_words = reinterpret_cast<UnitT*>(&val);

#pragma unroll
  for (int w = 0; w < kWordCount; ++w) {
    // Widen to unsigned int for the intrinsic, then narrow back into place.
    unsigned int moved = shfl_xor(static_cast<unsigned int>(src_words[w]), laneMask, width, mask);
    dst_words[w]       = static_cast<UnitT>(moved);
  }

  return result;
}

/**
* @brief Four-way byte dot product-accumulate.
* @tparam T Four-byte integer: int or unsigned int
Expand Down Expand Up @@ -816,83 +587,6 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
#endif
}

/**
 * @brief Logical-warp-level reduction
 * @tparam logicalWarpSize Logical warp size (2, 4, 8, 16 or 32)
 * @tparam T Value type to be reduced
 * @tparam ReduceLambda Reduction operation type
 * @param val input value
 * @param reduce_op Reduction operation
 * @return Reduction result. All lanes will have the valid result.
 * @note The xor-shuffle halving pattern is only correct for power-of-two
 *       logical warp sizes; this is now enforced at compile time instead of
 *       silently producing wrong results.
 */
template <int logicalWarpSize, typename T, typename ReduceLambda>
DI T logicalWarpReduce(T val, ReduceLambda reduce_op)
{
  static_assert(logicalWarpSize >= 2 && logicalWarpSize <= 32 &&
                  (logicalWarpSize & (logicalWarpSize - 1)) == 0,
                "logicalWarpSize must be a power of two between 2 and 32");
#pragma unroll
  for (int i = logicalWarpSize / 2; i > 0; i >>= 1) {
    // Butterfly exchange: combine with the lane logicalWarpSize/2^k away.
    T tmp = shfl_xor(val, i);
    val   = reduce_op(val, tmp);
  }
  return val;
}

/**
 * @brief Warp-level reduction over all lanes of a hardware warp.
 * @tparam T Value type to be reduced
 * @tparam ReduceLambda Reduction operation type
 * @param val input value
 * @param reduce_op Reduction operation
 * @return Reduction result. All lanes will have the valid result.
 * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
 * number of warps in a block. All threads in the warp must enter this
 * function together
 */
template <typename T, typename ReduceLambda>
DI T warpReduce(T val, ReduceLambda reduce_op)
{
  // A full warp is just a logical warp of hardware-warp width.
  T reduced = logicalWarpReduce<WarpSize>(val, reduce_op);
  return reduced;
}

/**
 * @brief Warp-level sum reduction over all lanes of a hardware warp.
 * @tparam T Value type to be reduced
 * @param val input value
 * @return Reduction result. All lanes will have the valid result.
 * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
 * number of warps in a block. All threads in the warp must enter this
 * function together
 */
template <typename T>
DI T warpReduce(T val)
{
  // Sum is the default reduction: delegate with raft's addition functor.
  raft::add_op sum_op{};
  return warpReduce(val, sum_op);
}

/**
 * @brief 1-D block-level sum reduction.
 * @param val input value
 * @param smem shared memory region needed for storing intermediate results. It
 * must at least be of size: `sizeof(T) * nWarps`
 * @return only the thread0 will contain valid reduced result
 * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
 * number of warps in a block. All threads in the block must enter this
 * function together
 * @todo Expand this to support arbitrary reduction ops
 */
template <typename T>
DI T blockReduce(T val, char* smem)
{
  T* warp_results     = reinterpret_cast<T*>(smem);
  const int num_warps = (blockDim.x + WarpSize - 1) / WarpSize;
  const int lane      = laneId();
  const int warp_id   = threadIdx.x / WarpSize;

  // Stage 1: each warp reduces its own values; lane 0 publishes the partial.
  val = warpReduce(val);
  if (lane == 0) { warp_results[warp_id] = val; }
  __syncthreads();

  // Stage 2: the first warp reduces the per-warp partials (0 is the sum identity).
  val = (lane < num_warps) ? warp_results[lane] : T(0);
  return warpReduce(val);
}

/**
* @brief Simple utility function to determine whether user_stream or one of the
* internal streams should be used.
Expand Down
Loading