From 17cd4577e4d6d6001374f3a7412eeee6bcf32d9e Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Mon, 21 Feb 2022 09:57:38 -0800 Subject: [PATCH 1/3] Add broadcast option to warpReduce --- cpp/include/raft/cuda_utils.cuh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index be995ea824..9992d3336b 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -652,18 +652,20 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff /** * @brief Warp-level sum reduction * @param val input value - * @return only the lane0 will contain valid reduced result + * @tparam T Value type to be reduced + * @tparam Broadcast Boolean flag to broadcast reduction value to all lanes + * @return only the lane0 will contain valid reduced result unless Broadcast is true * @note Why not cub? Because cub doesn't seem to allow working with arbitrary * number of warps in a block. All threads in the warp must enter this * function together * @todo Expand this to support arbitrary reduction ops */ -template <typename T> +template <typename T, bool Broadcast = false> DI T warpReduce(T val) { #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { - T tmp = shfl(val, laneId() + i); + T tmp = Broadcast ? 
shfl_xor(val, i) : shfl(val, laneId() + i); val += tmp; } return val; From 9a23e8417fead1c75fc1d97e26d3e5cd8880e0f8 Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Tue, 22 Feb 2022 00:54:09 -0800 Subject: [PATCH 2/3] Use shfl_xor --- cpp/include/raft/cuda_utils.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 9992d3336b..362dba66c5 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -653,19 +653,18 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff * @brief Warp-level sum reduction * @param val input value * @tparam T Value type to be reduced - * @tparam Broadcast Boolean flag to broadcast reduction value to all lanes - * @return only the lane0 will contain valid reduced result unless Broadcast is true + * @return Reduction result. All lanes will have the valid result. * @note Why not cub? Because cub doesn't seem to allow working with arbitrary * number of warps in a block. All threads in the warp must enter this * function together * @todo Expand this to support arbitrary reduction ops */ -template <typename T, bool Broadcast = false> +template <typename T> DI T warpReduce(T val) { #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { - T tmp = Broadcast ? 
shfl_xor(val, i) : shfl(val, laneId() + i); + T tmp = shfl_xor(val, i); val += tmp; } return val; From ab0dcedddbd16a177f304a4a02b8c9f5c45caa12 Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Wed, 23 Feb 2022 07:08:33 -0800 Subject: [PATCH 3/3] Add thrust/sort include to ball_cover.cuh --- cpp/include/raft/spatial/knn/detail/ball_cover.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 2b245d06cb..4911582ed9 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -45,6 +45,7 @@ #include #include #include +#include <thrust/sort.h> namespace raft { namespace spatial {