From 2dd9abb35091f98ae0c0d01667e815cc8fbb3dc5 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 5 Jan 2023 12:19:05 -0800 Subject: [PATCH 01/44] Remove faiss dependency from fused_l2_knn.cuh, selection_faiss.cuh, ball_cover.cuh and haversine_distance.cuh (#1108) Remove the dependency on faiss from the fused_l2_knn.cuh, selection_faiss.cuh, ball_cover.cuh and haversine_distance.cuh headers. This takes a copy of the faiss BlockSelect/WarpSelect device code for top-k selection, and updates to use raft primitives for things like reductions, KeyValuePair, warp shuffling etc. Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/raft/pull/1108 --- ci/checks/copyright.py | 4 +- cpp/include/raft/core/kvp.hpp | 25 +- .../raft/spatial/knn/detail/ball_cover.cuh | 7 +- .../knn/detail/ball_cover/registers.cuh | 57 +- .../knn/detail/faiss_select/Comparators.cuh | 29 + .../detail/faiss_select/MergeNetworkBlock.cuh | 277 +++++++++ .../detail/faiss_select/MergeNetworkUtils.cuh | 25 + .../MergeNetworkWarp.cuh} | 354 +++++------ .../knn/detail/faiss_select/Select.cuh | 555 ++++++++++++++++++ .../knn/detail/faiss_select/StaticUtils.h | 48 ++ .../key_value_block_select.cuh} | 46 +- .../raft/spatial/knn/detail/fused_l2_knn.cuh | 8 +- .../spatial/knn/detail/haversine_distance.cuh | 17 +- .../knn/detail/knn_brute_force_faiss.cuh | 15 +- .../spatial/knn/detail/selection_faiss.cuh | 15 +- thirdparty/LICENSES/LICENSE.faiss | 21 + 16 files changed, 1216 insertions(+), 287 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh rename cpp/include/raft/spatial/knn/detail/{warp_select_faiss.cuh => faiss_select/MergeNetworkWarp.cuh} (51%) create mode 100644 cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h rename cpp/include/raft/spatial/knn/detail/{block_select_faiss.cuh => faiss_select/key_value_block_select.cuh} (80%) create mode 100644 thirdparty/LICENSES/LICENSE.faiss diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index bfef5392f5..43a4a186f8 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ re.compile(r"setup[.]cfg$"), re.compile(r"meta[.]yaml$") ] -ExemptFiles = ["cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh"] +ExemptFiles = ["cpp/include/raft/spatial/knn/detail/faiss_select/"] # this will break starting at year 10000, which is probably OK :) CheckSimple = re.compile( diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp index f6ea841dc4..8d3321eb77 100644 --- a/cpp/include/raft/core/kvp.hpp +++ b/cpp/include/raft/core/kvp.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #ifdef _RAFT_HAS_CUDA #include +#include #endif namespace raft { /** @@ -58,5 +59,27 @@ struct KeyValuePair { { return (value != b.value) || (key != b.key); } + + RAFT_INLINE_FUNCTION bool operator<(const KeyValuePair<_Key, _Value>& b) const + { + return (key < b.key) || ((key == b.key) && value < b.value); + } + + RAFT_INLINE_FUNCTION bool operator>(const KeyValuePair<_Key, _Value>& b) const + { + return (key > b.key) || ((key == b.key) && value > b.value); + } }; + +#ifdef _RAFT_HAS_CUDA +template +RAFT_INLINE_FUNCTION KeyValuePair<_Key, _Value> shfl_xor(const KeyValuePair<_Key, _Value>& input, + int laneMask, + int width = WarpSize, + uint32_t mask = 0xffffffffu) +{ + return KeyValuePair<_Key, _Value>(shfl_xor(input.key, laneMask, width, mask), + shfl_xor(input.value, laneMask, width, mask)); +} +#endif } // end namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 797dbaab50..fd0314dbcc 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include "../ball_cover_types.hpp" #include "ball_cover/common.cuh" #include "ball_cover/registers.cuh" -#include "block_select_faiss.cuh" #include "haversine_distance.cuh" #include "knn_brute_force_faiss.cuh" #include "selection_faiss.cuh" @@ -31,6 +30,8 @@ #include +#include + #include #include #include @@ -38,8 +39,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh index a883a1eadd..530b0d3d04 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include "common.cuh" #include "../../ball_cover_types.hpp" -#include "../block_select_faiss.cuh" +#include "../faiss_select/key_value_block_select.cuh" #include "../haversine_distance.cuh" #include "../selection_faiss.cuh" @@ -28,9 +28,6 @@ #include -#include -#include - #include namespace raft { @@ -172,10 +169,10 @@ __global__ void compute_final_dists_registers(const value_t* X_index, dist_func dfunc, value_int* dist_counter) { - static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + static constexpr int kNumWarps = tpb / WarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; + __shared__ KeyValuePair shared_memV[kNumWarps * warp_q]; const value_t* x_ptr = X + (n_cols * blockIdx.x); value_t local_x_ptr[col_q]; @@ -183,21 +180,21 @@ __global__ void compute_final_dists_registers(const value_t* X_index, local_x_ptr[j] = x_ptr[j]; } - faiss::gpu::KeyValueBlockSelect, - warp_q, - thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), + faiss_select::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> + heap(std::numeric_limits::max(), + std::numeric_limits::max(), -1, shared_memK, shared_memV, k); - const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize); + const value_int n_k = Pow2::roundDown(k); value_int i = threadIdx.x; for (; i < n_k; i += tpb) { value_idx ind = knn_inds[blockIdx.x * k + i]; @@ -224,7 +221,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index, // Round R_size to the nearest warp threads so they can // all be computing in parallel. - const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + const value_int limit = Pow2::roundDown(R_size); i = threadIdx.x; for (; i < limit; i += tpb) { @@ -334,10 +331,10 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index, distance_func dfunc, float weight = 1.0) { - static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize; + static constexpr value_int kNumWarps = tpb / WarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; + __shared__ KeyValuePair shared_memV[kNumWarps * warp_q]; // TODO: Separate kernels for different widths: // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x" @@ -352,15 +349,15 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index, } // Each warp works on 1 R - faiss::gpu::KeyValueBlockSelect, - warp_q, - thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), + faiss_select::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> + heap(std::numeric_limits::max(), + std::numeric_limits::max(), -1, shared_memK, shared_memV, @@ -390,7 +387,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index, value_idx R_size = R_stop_offset - R_start_offset; - value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + value_int limit = Pow2::roundDown(R_size); value_int i = threadIdx.x; for (; i < limit; i += tpb) { // Index and distance of current candidate's nearest landmark diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh new file mode 100644 index 0000000000..173c06af30 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh @@ -0,0 +1,29 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include +#include + +namespace raft::spatial::knn::detail::faiss_select { + +template +struct Comparator { + __device__ static inline bool lt(T a, T b) { return a < b; } + + __device__ static inline bool gt(T a, T b) { return a > b; } +}; + +template <> +struct Comparator { + __device__ static inline bool lt(half a, half b) { return __hlt(a, b); } + + __device__ static inline bool gt(half a, half b) { return __hgt(a, b); } +}; + +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh new file mode 100644 index 0000000000..d923b41ded --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh @@ -0,0 +1,277 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include +#include +#include + +namespace raft::spatial::knn::detail::faiss_select { + +// Merge pairs of lists smaller than blockDim.x (NumThreads) +template +inline __device__ void blockMergeSmall(K* listK, V* listV) +{ + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); + static_assert(L <= NumThreads, "merge list size must be <= NumThreads"); + + // Which pair of lists we are merging + int mergeId = threadIdx.x / L; + + // Which thread we are within the merge + int tid = threadIdx.x % L; + + // listK points to a region of size N * 2 * L + listK += 2 * L * mergeId; + listV += 2 * L * mergeId; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { + int pos = 2 * tid - (tid & (stride - 1)); + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +// Merge pairs of sorted lists larger than blockDim.x (NumThreads) +template +inline __device__ void blockMergeLarge(K* listK, V* listV) +{ + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(L >= WarpSize, "merge list size must be >= 32"); + static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); + static_assert(L >= NumThreads, "merge list size must be >= NumThreads"); + + // For L > NumThreads, each thread has to perform more work + // per each stride. + constexpr int kLoopPerThread = L / NumThreads; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed +#pragma unroll + for (int loop = 0; loop < kLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + + constexpr int kSecondLoopPerThread = FullMerge ? kLoopPerThread : kLoopPerThread / 2; + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { +#pragma unroll + for (int loop = 0; loop < kSecondLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = 2 * tid - (tid & (stride - 1)); + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +/// Class template to prevent static_assert from firing for +/// mixing smaller/larger than block cases +template +struct BlockMerge { +}; + +/// Merging lists smaller than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) + { + constexpr int kNumParallelMerges = NumThreads / L; + constexpr int kNumIterations = N / kNumParallelMerges; + + static_assert(L <= NumThreads, "list must be <= NumThreads"); + static_assert((N < kNumParallelMerges) || (kNumIterations * kNumParallelMerges == N), + "improper selection of N and L"); + + if (N < kNumParallelMerges) { + // We only need L threads per each list to perform the merge + blockMergeSmall(listK, listV); + } else { + // All threads participate +#pragma unroll + for (int i = 0; i < kNumIterations; ++i) { + int start = i * kNumParallelMerges * 2 * L; + + blockMergeSmall(listK + start, + listV + start); + } + } + } +}; + +/// Merging lists larger than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) + { + // Each pair of lists is merged sequentially +#pragma unroll + for (int i = 0; i < N; ++i) { + int start = i * 2 * L; + + blockMergeLarge(listK + start, listV + start); + } + } +}; + +template +inline __device__ void blockMerge(K* listK, V* listV) +{ + constexpr bool kSmallerThanBlock = (L <= NumThreads); + + BlockMerge::merge(listK, listV); +} + +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh new file mode 100644 index 0000000000..2cb01f9199 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh @@ -0,0 +1,25 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +namespace raft::spatial::knn::detail::faiss_select { + +template +inline __device__ void swap(bool swap, T& x, T& y) +{ + T tmp = x; + x = swap ? y : x; + y = swap ? tmp : y; +} + +template +inline __device__ void assign(bool assign, T& x, T y) +{ + x = assign ? y : x; +} +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh similarity index 51% rename from cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh rename to cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh index 2ce2d34cca..bce739b2d8 100644 --- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh @@ -2,36 +2,31 @@ * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. + * LICENSE file thirdparty/LICENSES/LICENSE.faiss */ #pragma once -#include -#include -#include -#include -#include +#include +#include -#include +#include -namespace faiss { -namespace gpu { -using raft::KeyValuePair; +namespace raft::spatial::knn::detail::faiss_select { // // This file contains functions to: // // -perform bitonic merges on pairs of sorted lists, held in -// registers. Each list contains N * kWarpSize (multiple of 32) +// registers. Each list contains N * WarpSize (multiple of 32) // elements for some N. // The bitonic merge is implemented for arbitrary sizes; -// sorted list A of size N1 * kWarpSize registers -// sorted list B of size N2 * kWarpSize registers => -// sorted list C if size (N1 + N2) * kWarpSize registers. N1 and N2 +// sorted list A of size N1 * WarpSize registers +// sorted list B of size N2 * WarpSize registers => +// sorted list C if size (N1 + N2) * WarpSize registers. N1 and N2 // are >= 1 and don't have to be powers of 2. // -// -perform bitonic sorts on a set of N * kWarpSize key/value pairs +// -perform bitonic sorts on a set of N * WarpSize key/value pairs // held in registers, by using the above bitonic merge as a // primitive. // N can be an arbitrary N >= 1; i.e., the bitonic sort here supports @@ -80,7 +75,7 @@ using raft::KeyValuePair; // performing both < and > comparisons with the variables, so I just // stick with this. -// This function merges kWarpSize / 2L lists in parallel using warp +// This function merges WarpSize / 2L lists in parallel using warp // shuffles. // It works on at most size-16 lists, as we need 32 threads for this // shuffle merge. @@ -88,22 +83,19 @@ using raft::KeyValuePair; // If IsBitonic is false, the first stage is reversed, so we don't // need to sort directionally. It's still technically a bitonic sort. template -inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) +inline __device__ void warpBitonicMergeLE16(K& k, V& v) { static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); - static_assert(L <= kWarpSize / 2, "merge list size must be <= 16"); + static_assert(L <= WarpSize / 2, "merge list size must be <= 16"); - int laneId = getLaneId(); + int laneId = raft::laneId(); if (!IsBitonic) { // Reverse the first comparison stage. // For example, merging a list of size 8 has the exchanges: // 0 <-> 15, 1 <-> 14, ... - K otherK = shfl_xor(k, 2 * L - 1); - K otherVk = shfl_xor(v.key, 2 * L - 1); - V otherVv = shfl_xor(v.value, 2 * L - 1); - - KeyValuePair otherV = KeyValuePair(otherVk, otherVv); + K otherK = shfl_xor(k, 2 * L - 1); + V otherV = shfl_xor(v, 2 * L - 1); // Whether we are the lesser thread in the exchange bool small = !(laneId & L); @@ -114,24 +106,19 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) // alternatives in practice bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); assign(s, k, otherK); - assign(s, v.key, otherV.key); - assign(s, v.value, otherV.value); + assign(s, v, otherV); } else { bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); assign(s, k, otherK); - assign(s, v.value, otherV.value); - assign(s, v.key, otherV.key); + assign(s, v, otherV); } } #pragma unroll for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { - K otherK = shfl_xor(k, stride); - K otherVk = shfl_xor(v.key, stride); - V otherVv = shfl_xor(v.value, stride); - - KeyValuePair otherV = KeyValuePair(otherVk, otherVv); + K otherK = shfl_xor(k, stride); + V otherV = shfl_xor(v, stride); // Whether we are the lesser thread in the exchange bool small = !(laneId & stride); @@ -139,14 +126,12 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) if (Dir) { bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); assign(s, k, otherK); - assign(s, v.key, otherV.key); - assign(s, v.value, otherV.value); + assign(s, v, otherV); } else { bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); assign(s, k, otherK); - assign(s, v.key, otherV.key); - assign(s, v.value, otherV.value); + assign(s, v, otherV); } } } @@ -154,7 +139,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) // Template for performing a bitonic merge of an arbitrary set of // registers template -struct BitonicMergeStepKVP { +struct BitonicMergeStep { }; // @@ -163,74 +148,69 @@ struct BitonicMergeStepKVP { // All merges eventually call this template -struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[1], KeyValuePair v[1]) +struct BitonicMergeStep { + static inline __device__ void merge(K k[1], V v[1]) { // Use warp shuffles - warpBitonicMergeLE16KVP(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); } }; template -struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { static_assert(utils::isPowerOf2(N), "must be power of 2"); static_assert(N > 1, "must be N > 1"); #pragma unroll for (int i = 0; i < N / 2; ++i) { - K& ka = k[i]; - KeyValuePair& va = v[i]; + K& ka = k[i]; + V& va = v[i]; - K& kb = k[i + N / 2]; - KeyValuePair& vb = v[i + N / 2]; + K& kb = k[i + N / 2]; + V& vb = v[i + N / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); swap(s, ka, kb); - swap(s, va.key, vb.key); - swap(s, va.value, vb.value); + swap(s, va, vb); } { K newK[N / 2]; - KeyValuePair newV[N / 2]; + V newV[N / 2]; #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; - newV[i].value = v[i].value; + newK[i] = k[i]; + newV[i] = v[i]; } - BitonicMergeStepKVP::merge(newK, newV); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; - v[i].value = newV[i].value; + k[i] = newK[i]; + v[i] = newV[i]; } } { K newK[N / 2]; - KeyValuePair newV[N / 2]; + V newV[N / 2]; #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i + N / 2]; - newV[i].key = v[i + N / 2].key; - newV[i].value = v[i + N / 2].value; + newK[i] = k[i + N / 2]; + newV[i] = v[i + N / 2]; } - BitonicMergeStepKVP::merge(newK, newV); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i + N / 2] = newK[i]; - v[i + N / 2].key = newV[i].key; - v[i + N / 2].value = newV[i].value; + k[i + N / 2] = newK[i]; + v[i + N / 2] = newV[i]; } } } @@ -242,8 +222,8 @@ struct BitonicMergeStepKVP { // Low recursion template -struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -252,77 +232,73 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; - KeyValuePair& va = v[i]; + K& ka = k[i]; + V& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; - KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); swap(s, ka, kb); - swap(s, va.key, vb.key); - swap(s, va.value, vb.value); + swap(s, va, vb); } constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; constexpr int kHighSize = kNextHighestPowerOf2 / 2; { K newK[kLowSize]; - KeyValuePair newV[kLowSize]; + V newV[kLowSize]; #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; - newV[i].value = v[i].value; + newK[i] = k[i]; + newV[i] = v[i]; } constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; - v[i].value = newV[i].value; + k[i] = newK[i]; + v[i] = newV[i]; } } { K newK[kHighSize]; - KeyValuePair newV[kHighSize]; + V newV[kHighSize]; #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; - newV[i].value = v[i + kLowSize].value; + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; } constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? - // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); + // constexpr bool kHighIsPowerOf2 = + // utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; - v[i + kLowSize].value = newV[i].value; + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; } } } @@ -330,8 +306,8 @@ struct BitonicMergeStepKVP { // High recursion template -struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -340,149 +316,137 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; - KeyValuePair& va = v[i]; + K& ka = k[i]; + V& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; - KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); swap(s, ka, kb); - swap(s, va.key, vb.key); - swap(s, va.value, vb.value); + swap(s, va, vb); } constexpr int kLowSize = kNextHighestPowerOf2 / 2; constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; { K newK[kLowSize]; - KeyValuePair newV[kLowSize]; + V newV[kLowSize]; #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; - newV[i].value = v[i].value; + newK[i] = k[i]; + newV[i] = v[i]; } constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; - v[i].value = newV[i].value; + k[i] = newK[i]; + v[i] = newV[i]; } } { K newK[kHighSize]; - KeyValuePair newV[kHighSize]; + V newV[kHighSize]; #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; - newV[i].value = v[i + kLowSize].value; + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; } constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? - // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); + // constexpr bool kHighIsPowerOf2 = + // utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; - v[i + kLowSize].value = newV[i].value; + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; } } } }; /// Merges two sets of registers across the warp of any size; -/// i.e., merges a sorted k/v list of size kWarpSize * N1 with a -/// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any +/// i.e., merges a sorted k/v list of size WarpSize * N1 with a +/// sorted k/v list of size WarpSize * N2, where N1 and N2 are any /// value >= 1 template -inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], - KeyValuePair v1[N1], - K k2[N2], - KeyValuePair v2[N2]) +inline __device__ void warpMergeAnyRegisters(K k1[N1], V v1[N1], K k2[N2], V v2[N2]) { constexpr int kSmallestN = N1 < N2 ? N1 : N2; #pragma unroll for (int i = 0; i < kSmallestN; ++i) { - K& ka = k1[N1 - 1 - i]; - KeyValuePair& va = v1[N1 - 1 - i]; + K& ka = k1[N1 - 1 - i]; + V& va = v1[N1 - 1 - i]; - K& kb = k2[i]; - KeyValuePair& vb = v2[i]; + K& kb = k2[i]; + V& vb = v2[i]; K otherKa; - KeyValuePair otherVa; + V otherVa; if (FullMerge) { // We need the other values - otherKa = shfl_xor(ka, kWarpSize - 1); - K otherVak = shfl_xor(va.key, kWarpSize - 1); - V otherVav = shfl_xor(va.value, kWarpSize - 1); - otherVa = KeyValuePair(otherVak, otherVav); + otherKa = shfl_xor(ka, WarpSize - 1); + otherVa = shfl_xor(va, WarpSize - 1); } - K otherKb = shfl_xor(kb, kWarpSize - 1); - K otherVbk = shfl_xor(vb.key, kWarpSize - 1); - V otherVbv = shfl_xor(vb.value, kWarpSize - 1); + K otherKb = shfl_xor(kb, WarpSize - 1); + V otherVb = shfl_xor(vb, WarpSize - 1); // ka is always first in the list, so we needn't use our lane // in this comparison bool swapa = Dir ? Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb); assign(swapa, ka, otherKb); - assign(swapa, va.key, otherVbk); - assign(swapa, va.value, otherVbv); + assign(swapa, va, otherVb); // kb is always second in the list, so we needn't use our lane // in this comparison if (FullMerge) { bool swapb = Dir ? Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa); assign(swapb, kb, otherKa); - assign(swapb, vb.key, otherVa.key); - assign(swapb, vb.value, otherVa.value); + assign(swapb, vb, otherVa); } else { // We don't care about updating elements in the second list } } - BitonicMergeStepKVP::merge(k1, v1); + BitonicMergeStep::merge(k1, v1); if (FullMerge) { // Only if we care about N2 do we need to bother merging it fully - BitonicMergeStepKVP::merge(k2, v2); + BitonicMergeStep::merge(k2, v2); } } // Recursive template that uses the above bitonic merge to perform a // bitonic sort template -struct BitonicSortStepKVP { - static inline __device__ void sort(K k[N], KeyValuePair v[N]) +struct BitonicSortStep { + static inline __device__ void sort(K k[N], V v[N]) { static_assert(N > 1, "did not hit specialized case"); @@ -491,71 +455,67 @@ struct BitonicSortStepKVP { constexpr int kSizeB = N - kSizeA; K aK[kSizeA]; - KeyValuePair aV[kSizeA]; + V aV[kSizeA]; #pragma unroll for (int i = 0; i < kSizeA; ++i) { - aK[i] = k[i]; - aV[i].key = v[i].key; - aV[i].value = v[i].value; + aK[i] = k[i]; + aV[i] = v[i]; } - BitonicSortStepKVP::sort(aK, aV); + BitonicSortStep::sort(aK, aV); K bK[kSizeB]; - KeyValuePair bV[kSizeB]; + V bV[kSizeB]; #pragma unroll for (int i = 0; i < kSizeB; ++i) { - bK[i] = k[i + kSizeA]; - bV[i].key = v[i + kSizeA].key; - bV[i].value = v[i + kSizeA].value; + bK[i] = k[i + kSizeA]; + bV[i] = v[i + kSizeA]; } - BitonicSortStepKVP::sort(bK, bV); + BitonicSortStep::sort(bK, bV); // Merge halves - warpMergeAnyRegistersKVP(aK, aV, bK, bV); + warpMergeAnyRegisters(aK, aV, bK, bV); #pragma unroll for (int i = 0; i < kSizeA; ++i) { - k[i] = aK[i]; - v[i].key = aV[i].key; - v[i].value = aV[i].value; + k[i] = aK[i]; + v[i] = aV[i]; } #pragma unroll for (int i = 0; i < kSizeB; ++i) { - k[i + kSizeA] = bK[i]; - v[i + kSizeA].key = bV[i].key; - v[i + kSizeA].value = bV[i].value; + k[i + kSizeA] = bK[i]; + v[i + kSizeA] = bV[i]; } } }; // Single warp (N == 1) sorting specialization template -struct BitonicSortStepKVP { - static inline __device__ void sort(K k[1], KeyValuePair v[1]) +struct BitonicSortStep { + static inline __device__ void sort(K k[1], V v[1]) { // Update this code if this changes - // should go from 1 -> kWarpSize in multiples of 2 - static_assert(kWarpSize == 32, "unexpected warp size"); - - warpBitonicMergeLE16KVP(k[0], v[0]); - warpBitonicMergeLE16KVP(k[0], v[0]); - warpBitonicMergeLE16KVP(k[0], v[0]); - warpBitonicMergeLE16KVP(k[0], v[0]); - warpBitonicMergeLE16KVP(k[0], v[0]); + // should go from 1 -> WarpSize in multiples of 2 + static_assert(WarpSize == 32, "unexpected warp size"); + + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); } }; -/// Sort a list of kWarpSize * N elements in registers, where N is an +/// Sort a list of WarpSize * N elements in registers, where N is an /// arbitrary >= 1 template -inline __device__ void warpSortAnyRegistersKVP(K k[N], KeyValuePair v[N]) +inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) { - BitonicSortStepKVP::sort(k, v); + BitonicSortStep::sort(k, v); } -} // namespace gpu -} // namespace faiss + +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh new file mode 100644 index 0000000000..e4faff7a6c --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh @@ -0,0 +1,555 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::spatial::knn::detail::faiss_select { + +// Specialization for block-wide monotonic merges producing a merge sort +// since what we really want is a constexpr loop expansion +template +struct FinalBlockMerge { +}; + +template +struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + // no merge required; single warp + } +}; + +template +struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, + sharedV); + } +}; + +template +struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + blockMerge(sharedK, + sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge( + sharedK, sharedV); + } +}; + +template +struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + blockMerge(sharedK, + sharedV); + blockMerge(sharedK, + sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge( + sharedK, sharedV); + } +}; + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. +template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / WarpSize; + static constexpr int kTotalWarpSortSize = NumWarpQ; + + __device__ inline BlockSelect(K initKVal, V initVVal, K* smemK, V* smemV, int k) + : initK(initKVal), + initV(initVVal), + numVals(0), + warpKTop(initKVal), + sharedK(smemK), + sharedV(smemV), + kMinus1(k - 1) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + int laneId = raft::laneId(); + int warpId = threadIdx.x / WarpSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; + + // Fill warp queue (only the actual queue space is fine, not where + // we write the per-thread queues for merging) + for (int i = laneId; i < NumWarpQ; i += WarpSize) { + warpK[i] = initK; + warpV[i] = initV; + } + + warpFence(); + } + + __device__ inline void addThreadQ(K k, V v) + { + if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() + { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + // This has a trailing warpFence + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = warpK[kMinus1]; + + warpFence(); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() + { + int laneId = raft::laneId(); + + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); + + constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize; + K warpKRegisters[kNumWarpQRegisters]; + V warpVRegisters[kNumWarpQRegisters]; + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpKRegisters[i] = warpK[i * WarpSize + laneId]; + warpVRegisters[i] = warpV[i * WarpSize + laneId]; + } + + warpFence(); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpKRegisters, warpVRegisters, threadK, threadV); + + // Write back out the warp queue +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i * WarpSize + laneId] = warpKRegisters[i]; + warpV[i * WarpSize + laneId] = warpVRegisters[i]; + } + + warpFence(); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, V v) + { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() + { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + + // block-wide dep; thus far, all warps have been completely + // independent + __syncthreads(); + + // All warp queues are contiguous in smem. + // Now, we have kNumWarps lists of NumWarpQ elements. + // This is a power of 2. + FinalBlockMerge::merge(sharedK, sharedV); + + // The block-wide merge has a trailing syncthreads + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // Queues for all warps + K* sharedK; + V* sharedV; + + // Our warp's queue (points into sharedK/sharedV) + // warpK[0] is highest (Dir) or lowest (!Dir) + K* warpK; + V* warpV; + + // This is a cached k-1 value + int kMinus1; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / WarpSize; + + __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) + : threadK(initK), threadV(initV), sharedK(smemK), sharedV(smemV) + { + } + + __device__ inline void addThreadQ(K k, V v) + { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + __device__ inline void checkThreadQ() + { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { addThreadQ(k, v); } + + __device__ inline void reduce() + { + // Reduce within the warp + KeyValuePair pair(threadK, threadV); + + if (Dir) { + pair = warpReduce(pair, max_op{}); + } else { + pair = warpReduce(pair, min_op{}); + } + + // Each warp writes out a single value + int laneId = raft::laneId(); + int warpId = threadIdx.x / WarpSize; + + if (laneId == 0) { + sharedK[warpId] = pair.key; + sharedV[warpId] = pair.value; + } + + __syncthreads(); + + // We typically use this for small blocks (<= 128), just having the + // first thread in the block perform the reduction across warps is + // faster + if (threadIdx.x == 0) { + threadK = sharedK[0]; + threadV = sharedV[0]; + +#pragma unroll + for (int i = 1; i < kNumWarps; ++i) { + K k = sharedK[i]; + V v = sharedV[i]; + + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + // Hopefully a thread's smem reads/writes are ordered wrt + // itself, so no barrier needed :) + sharedK[0] = threadK; + sharedV[0] = threadV; + } + + // In case other threads wish to read this value + __syncthreads(); + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; + + // Where we reduce in smem + K* sharedK; + V* sharedV; +}; + +// +// per-warp WarpSelect +// + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. +template +struct WarpSelect { + static constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize; + + __device__ inline WarpSelect(K initKVal, V initVVal, int k) + : initK(initKVal), initV(initVVal), numVals(0), warpKTop(initKVal), kLane((k - 1) % WarpSize) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // Fill the warp queue with the default value +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i] = initK; + warpV[i] = initV; + } + } + + __device__ inline void addThreadQ(K k, V v) + { + if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() + { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() + { + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpK, warpV, threadK, threadV); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, V v) + { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() + { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) + { + int laneId = raft::laneId(); + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + int idx = i * WarpSize + laneId; + + if (idx < k) { + outK[idx] = warpK[i]; + outV[idx] = warpV[i]; + } + } + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // warpK[0] is highest (Dir) or lowest (!Dir) + K warpK[kNumWarpQRegisters]; + V warpV[kNumWarpQRegisters]; + + // This is what lane we should load an approximation (>=k) to the + // kth element from the last register in the warp queue (i.e., + // warpK[kNumWarpQRegisters - 1]). + int kLane; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct WarpSelect { + static constexpr int kNumWarps = ThreadsPerBlock / WarpSize; + + __device__ inline WarpSelect(K initK, V initV, int k) : threadK(initK), threadV(initV) {} + + __device__ inline void addThreadQ(K k, V v) + { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + __device__ inline void checkThreadQ() + { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { addThreadQ(k, v); } + + __device__ inline void reduce() + { + // Reduce within the warp + KeyValuePair pair(threadK, threadV); + + if (Dir) { + pair = warpReduce(pair, max_op{}); + } else { + pair = warpReduce(pair, min_op{}); + } + + threadK = pair.key; + threadV = pair.value; + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) + { + if (raft::laneId() == 0) { + *outK = threadK; + *outV = threadV; + } + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; +}; + +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h b/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h new file mode 100644 index 0000000000..bac051b68c --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include + +// allow usage for non-CUDA files +#ifndef __host__ +#define __host__ +#define __device__ +#endif + +namespace raft::spatial::knn::detail::faiss_select::utils { + +template +constexpr __host__ __device__ bool isPowerOf2(T v) +{ + return (v && !(v & (v - 1))); +} + +static_assert(isPowerOf2(2048), "isPowerOf2"); +static_assert(!isPowerOf2(3333), "isPowerOf2"); + +template +constexpr __host__ __device__ T nextHighestPowerOf2(T v) +{ + return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (log2(v) + 1))); +} + +static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL, + "nextHighestPowerOf2"); + +} // namespace raft::spatial::knn::detail::faiss_select::utils diff --git a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh similarity index 80% rename from cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh rename to cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh index 34240fba64..617a26a243 100644 --- a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh @@ -2,26 +2,19 @@ * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. + * LICENSE file thirdparty/LICENSES/LICENSE.faiss */ #pragma once -#include -#include -#include -#include -#include -#include - -#include "warp_select_faiss.cuh" +#include +#include // TODO: Need to think further about the impact (and new boundaries created) on the registers // because this will change the max k that can be processed. One solution might be to break // up k into multiple batches for larger k. -namespace faiss { -namespace gpu { +namespace raft::spatial::knn::detail::faiss_select { // `Dir` true, produce largest values. // `Dir` false, produce smallest values. @@ -33,7 +26,7 @@ template struct KeyValueBlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + static constexpr int kNumWarps = ThreadsPerBlock / WarpSize; static constexpr int kTotalWarpSortSize = NumWarpQ; __device__ inline KeyValueBlockSelect( @@ -59,14 +52,14 @@ struct KeyValueBlockSelect { threadV[i].value = initVv; } - int laneId = getLaneId(); - int warpId = threadIdx.x / kWarpSize; + int laneId = raft::laneId(); + int warpId = threadIdx.x / WarpSize; warpK = sharedK + warpId * kTotalWarpSortSize; warpV = sharedV + warpId * kTotalWarpSortSize; // Fill warp queue (only the actual queue space is fine, not where // we write the per-thread queues for merging) - for (int i = laneId; i < NumWarpQ; i += kWarpSize) { + for (int i = laneId; i < NumWarpQ; i += WarpSize) { warpK[i] = initK; warpV[i].key = initVk; warpV[i].value = initVv; @@ -134,20 +127,20 @@ struct KeyValueBlockSelect { /// list across both __device__ inline void mergeWarpQ() { - int laneId = getLaneId(); + int laneId = raft::laneId(); // Sort all of the per-thread queues - warpSortAnyRegistersKVP(threadK, threadV); + warpSortAnyRegisters, NumThreadQ, !Dir, Comp>(threadK, threadV); - constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize; + constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize; K warpKRegisters[kNumWarpQRegisters]; KeyValuePair warpVRegisters[kNumWarpQRegisters]; #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpKRegisters[i] = warpK[i * kWarpSize + laneId]; - warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key; - warpVRegisters[i].value = warpV[i * kWarpSize + laneId].value; + warpKRegisters[i] = warpK[i * WarpSize + laneId]; + warpVRegisters[i].key = warpV[i * WarpSize + laneId].key; + warpVRegisters[i].value = warpV[i * WarpSize + laneId].value; } warpFence(); @@ -155,15 +148,15 @@ struct KeyValueBlockSelect { // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list - warpMergeAnyRegistersKVP( + warpMergeAnyRegisters, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>( warpKRegisters, warpVRegisters, threadK, threadV); // Write back out the warp queue #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * kWarpSize + laneId] = warpKRegisters[i]; - warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key; - warpV[i * kWarpSize + laneId].value = warpVRegisters[i].value; + warpK[i * WarpSize + laneId] = warpKRegisters[i]; + warpV[i * WarpSize + laneId].key = warpVRegisters[i].key; + warpV[i * WarpSize + laneId].value = warpVRegisters[i].value; } warpFence(); @@ -228,5 +221,4 @@ struct KeyValueBlockSelect { int kMinus1; }; -} // namespace gpu -} // namespace faiss \ No newline at end of file +} // namespace raft::spatial::knn::detail::faiss_select diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 85a05877f1..f1f160a154 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,9 @@ */ #pragma once #include -#include #include #include +#include // TODO: Need to hide the PairwiseDistance class impl and expose to public API #include "processing.cuh" #include @@ -219,8 +219,8 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x constexpr auto identity = std::numeric_limits::max(); constexpr auto keyMax = std::numeric_limits::max(); constexpr auto Dir = false; - typedef faiss::gpu:: - WarpSelect, NumWarpQ, NumThreadQ, 32> + typedef faiss_select:: + WarpSelect, NumWarpQ, NumThreadQ, 32> myWarpSelect; auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__( diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 333fc1c573..e073841dd3 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,12 +18,11 @@ #include #include - -#include -#include +#include #include #include +#include namespace raft { namespace spatial { @@ -61,21 +60,21 @@ __global__ void haversine_knn_kernel(value_idx* out_inds, size_t n_index_rows, int k) { - constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + constexpr int kNumWarps = tpb / WarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(faiss::gpu::Limits::getMax(), + faiss_select:: + BlockSelect, warp_q, thread_q, tpb> + heap(std::numeric_limits::max(), std::numeric_limits::max(), smemK, smemV, k); // Grid is exactly sized to rows available - int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); + int limit = Pow2::roundDown(n_index_rows); const value_t* query_ptr = query + (blockIdx.x * 2); value_t x1 = query_ptr[0]; diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 086cae1089..b246121958 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,13 +23,12 @@ #include #include -#include -#include #include #include #include #include +#include #include #include #include @@ -61,7 +60,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK, int k, value_idx* translations) { - constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; + constexpr int kNumWarps = tpb / WarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; @@ -69,8 +68,8 @@ __global__ void knn_merge_parts_kernel(value_t* inK, /** * Uses shared memory */ - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> + faiss_select:: + BlockSelect, warp_q, thread_q, tpb> heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available @@ -88,7 +87,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK, value_t* inKStart = inK + (row_idx + col); value_idx* inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = Pow2::roundDown(total_k); value_idx translation = 0; for (; i < limit; i += tpb) { @@ -134,7 +133,7 @@ inline void knn_merge_parts_impl(value_t* inK, constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; auto block = dim3(n_threads); - auto kInit = faiss::gpu::Limits::getMax(); + auto kInit = std::numeric_limits::max(); auto vInit = -1; knn_merge_parts_kernel <<>>( diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 27c7e006ca..2cdc0fae91 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include #include -#include +#include namespace raft { namespace spatial { @@ -50,9 +50,14 @@ __global__ void select_k_kernel(const key_t* inK, __shared__ key_t smemK[kNumWarps * warp_q]; __shared__ payload_t smemV[kNumWarps * warp_q]; - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss_select::BlockSelect, + warp_q, + thread_q, + tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; diff --git a/thirdparty/LICENSES/LICENSE.faiss b/thirdparty/LICENSES/LICENSE.faiss new file mode 100644 index 0000000000..87cbf536c6 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.faiss @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From 9944b3a8b83bfd6cd8298a73cd175298e168d264 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Fri, 6 Jan 2023 23:14:06 +0100 Subject: [PATCH 02/44] Make IVF-PQ build index in batches when necessary (#1056) Before this patch, when the input data was not accessible directly from the device, the `build` and `extend` functions mapped it using the `cudaHostRegister`. Although this approach was rather fast, it could fail when the input data is too large to fit in the device memory. This PR, changes the logic of `build` and `extend`, so that the data is loaded in batches when necessary. Moreover, when the passed pointer represents the mapped file (e.g. using the system call `mmap` ), the size of the input may even be larger than the host memory. The `build` does one pass through the input (to sample the training set), and the `extend` does at most two passes. Authors: - Artem M. Chirkin (https://github.com/achirkin) - Tamas Bela Feher (https://github.com/tfeher) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1056 --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 14 +- .../raft/spatial/knn/detail/ann_utils.cuh | 205 +++ .../raft/spatial/knn/detail/ivf_pq_build.cuh | 1243 +++++++++-------- 3 files changed, 888 insertions(+), 574 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 825e2902c3..244d1879d8 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -316,8 +316,16 @@ struct index : ann::index { */ void allocate(const handle_t& handle, IdxT index_size) { - pq_dataset_ = make_device_mdarray(handle, make_pq_dataset_extents(index_size)); - indices_ = make_device_mdarray(handle, make_extents(index_size)); + try { + pq_dataset_ = make_device_mdarray(handle, make_pq_dataset_extents(index_size)); + indices_ = make_device_mdarray(handle, make_extents(index_size)); + } catch (std::bad_alloc& e) { + RAFT_FAIL( + "ivf-pq: failed to allocate a big enough index to hold all data (size: %zu). " + "Allocator exception: %s", + size_t(index_size), + e.what()); + } if (index_size > 0) { thrust::fill_n( handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord); @@ -434,7 +442,7 @@ struct index : ann::index { /** A helper function to determine the extents of an array enough to hold a given amount of data. */ - auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents + auto make_pq_dataset_extents(IdxT n_rows) const -> pq_dataset_extents { // how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits(); diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index b721915187..32d4f67a20 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -16,12 +16,19 @@ #pragma once +#include #include #include #include #include +#include #include +#include +#include + +#include +#include namespace raft::spatial::knn::detail::utils { @@ -359,4 +366,202 @@ void copy_selected(IdxT n_rows, } } +/** + * A batch input iterator over the data source. + * Given an input pointer, it decides whether the current device has the access to the data and + * gives it back to the user in batches. Three scenarios are possible: + * + * 1. if `source == nullptr`: then `batch.data() == nullptr` + * 2. if `source` is accessible from the device, `batch.data()` points directly at the source at + * the proper offsets on each iteration. + * 3. if `source` is not accessible from the device, `batch.data()` points to an intermediate + * buffer; the corresponding data is copied in the given `stream` on every iterator dereference + * (i.e. batches can be skipped). Dereferencing the same batch two times in a row does not force + * the copy. + * + * In all three scenarios, the number of iterations, batch offsets and sizes are the same. + * + * The iterator can be reused. If the number of iterations is one, at most one copy will ever be + * invoked (i.e. small datasets are not reloaded multiple times). + */ +template +struct batch_load_iterator { + using size_type = size_t; + + /** A single batch of data residing in device memory. */ + struct batch { + /** Logical width of a single row in a batch, in elements of type `T`. */ + [[nodiscard]] auto row_width() const -> size_type { return row_width_; } + /** Logical offset of the batch, in rows (`row_width()`) */ + [[nodiscard]] auto offset() const -> size_type { return pos_.value_or(0) * batch_size_; } + /** Logical size of the batch, in rows (`row_width()`) */ + [[nodiscard]] auto size() const -> size_type { return batch_len_; } + /** Logical size of the batch, in rows (`row_width()`) */ + [[nodiscard]] auto data() const -> const T* { return const_cast(dev_ptr_); } + /** Whether this batch copies the data (i.e. the source is inaccessible from the device). */ + [[nodiscard]] auto does_copy() const -> bool { return needs_copy_; } + + private: + batch(const T* source, + size_type n_rows, + size_type row_width, + size_type batch_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : stream_(stream), + buf_(0, stream, mr), + source_(source), + dev_ptr_(nullptr), + n_rows_(n_rows), + row_width_(row_width), + batch_size_(std::min(batch_size, n_rows)), + pos_(std::nullopt), + n_iters_(raft::div_rounding_up_safe(n_rows, batch_size)), + needs_copy_(false) + { + if (source_ == nullptr) { return; } + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, source_)); + dev_ptr_ = reinterpret_cast(attr.devicePointer); + if (dev_ptr_ == nullptr) { + buf_.resize(row_width_ * batch_size_, stream); + dev_ptr_ = buf_.data(); + needs_copy_ = true; + } + } + rmm::cuda_stream_view stream_; + rmm::device_uvector buf_; + const T* source_; + size_type n_rows_; + size_type row_width_; + size_type batch_size_; + size_type n_iters_; + bool needs_copy_; + + std::optional pos_; + size_type batch_len_; + T* dev_ptr_; + + friend class batch_load_iterator; + + /** + * Changes the state of the batch to point at the `pos` index. + * If necessary, copies the data from the source in the registered stream. + */ + void load(const size_type& pos) + { + // No-op if the data is already loaded, or it's the end of the input. + if (pos == pos_ || pos >= n_iters_) { return; } + pos_.emplace(pos); + batch_len_ = std::min(batch_size_, n_rows_ - std::min(offset(), n_rows_)); + if (source_ == nullptr) { return; } + if (needs_copy_) { + if (size() > 0) { + RAFT_LOG_DEBUG("batch_load_iterator::copy(offset = %zu, size = %zu, row_width = %zu)", + size_t(offset()), + size_t(size()), + size_t(row_width())); + copy(dev_ptr_, source_ + offset() * row_width(), size() * row_width(), stream_); + } + } else { + dev_ptr_ = const_cast(source_) + offset() * row_width(); + } + } + }; + + using value_type = batch; + using reference = const value_type&; + using pointer = const value_type*; + + /** + * Create a batch iterator over the data `source`. + * + * For convenience, the data `source` is read in logical units of size `row_width`; batch sizes + * and offsets are calculated in logical rows. Hence, can interpret the data as a contiguous + * row-major matrix of size [n_rows, row_width], and the batches are the sub-matrices of size + * [x<=batch_size, n_rows]. + * + * @param source the input data -- host, device, or nullptr. + * @param n_rows the size of the input in logical rows. + * @param row_width the size of the logical row in the elements of type `T`. + * @param batch_size the desired size of the batch. + * @param stream the ordering for the host->device copies, if applicable. + * @param mr a custom memory resource for the intermediate buffer, if applicable. + */ + batch_load_iterator(const T* source, + size_type n_rows, + size_type row_width, + size_type batch_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr)), cur_pos_(0) + { + } + /** + * Whether this iterator copies the data on every iteration + * (i.e. the source is inaccessible from the device). + */ + [[nodiscard]] auto does_copy() const -> bool { return cur_batch_->does_copy(); } + /** Reset the iterator position to `begin()` */ + void reset() { cur_pos_ = 0; } + /** Reset the iterator position to `end()` */ + void reset_to_end() { cur_pos_ = cur_batch_->n_iters_; } + [[nodiscard]] auto begin() const -> const batch_load_iterator + { + batch_load_iterator x(*this); + x.reset(); + return x; + } + [[nodiscard]] auto end() const -> const batch_load_iterator + { + batch_load_iterator x(*this); + x.reset_to_end(); + return x; + } + [[nodiscard]] auto operator*() const -> reference + { + cur_batch_->load(cur_pos_); + return *cur_batch_; + } + [[nodiscard]] auto operator->() const -> pointer + { + cur_batch_->load(cur_pos_); + return cur_batch_.get(); + } + friend auto operator==(const batch_load_iterator& x, const batch_load_iterator& y) -> bool + { + return x.cur_batch_ == y.cur_batch_ && x.cur_pos_ == y.cur_pos_; + }; + friend auto operator!=(const batch_load_iterator& x, const batch_load_iterator& y) -> bool + { + return x.cur_batch_ != y.cur_batch_ || x.cur_pos_ != y.cur_pos_; + }; + auto operator++() -> batch_load_iterator& + { + ++cur_pos_; + return *this; + } + auto operator++(int) -> batch_load_iterator + { + batch_load_iterator x(*this); + ++cur_pos_; + return x; + } + auto operator--() -> batch_load_iterator& + { + --cur_pos_; + return *this; + } + auto operator--(int) -> batch_load_iterator + { + batch_load_iterator x(*this); + --cur_pos_; + return x; + } + + private: + std::shared_ptr cur_batch_; + size_type cur_pos_; +}; + } // namespace raft::spatial::knn::detail::utils diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index d718deeb57..fa7504866d 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -48,9 +49,12 @@ #include #include #include +#include #include #include +#include + namespace raft::spatial::knn::ivf_pq::detail { using namespace raft::spatial::knn::detail; // NOLINT @@ -61,7 +65,9 @@ using raft::neighbors::ivf_pq::index_params; using raft::neighbors::ivf_pq::kIndexGroupSize; using raft::neighbors::ivf_pq::kIndexGroupVecLen; -using pq_codes_exts = extents; +using pq_vec_t = TxN_t::io_t; +using pq_new_vec_exts = extents; +using pq_int_vec_exts = extents; namespace { @@ -117,80 +123,53 @@ struct bitfield_view_t { } }; -/* - NB: label type is uint32_t although it can only contain values up to `1 << pq_bits`. - We keep it this way to not force one more overload for kmeans::predict. - */ -template -__device__ void ivfpq_encode_core(uint32_t n_rows, - uint32_t pq_dim, - const uint32_t* label, - uint8_t* output) +template +__launch_bounds__(BlockDim) __global__ void copy_warped_kernel( + T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) { - constexpr uint32_t kChunkSize = (VecLen * 8u) / PqBits; - TxN_t vec; - for (uint32_t j = 0; j < pq_dim;) { - vec.fill(0); - bitfield_view_t out{vec.val.data}; -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++, label += n_rows) { - out[k] = static_cast(*label); - } - vec.store(output, 0); - output += VecLen; + using warp = Pow2; + size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x)); + uint32_t i = warp::mod(threadIdx.x); + if (row_ix >= n_rows) return; + out += row_ix * ld_out; + in += row_ix * ld_in; + auto f = utils::mapping{}; + for (uint32_t col_ix = i; col_ix < n_cols; col_ix += warp::Value) { + auto x = f(in[col_ix]); + __syncwarp(); + out[col_ix] = x; } } -template -__launch_bounds__(BlockDim) __global__ - void ivfpq_encode_kernel(uint32_t pq_dim, - const uint32_t* label, // [pq_dim, n_rows] - device_mdspan output // [n_rows, ..] - ) -{ - uint32_t i = threadIdx.x + BlockDim * blockIdx.x; - if (i >= output.extent(0)) return; - ivfpq_encode_core( - output.extent(0), - pq_dim, - label + i, - output.data_handle() + output.extent(1) * output.extent(2) * i); -} -} // namespace - /** - * Compress the cluster labels into an encoding with pq_bits bits, and transform it into a form to - * facilitate vectorized loads + * Copy the data one warp-per-row: + * + * 1. load the data per-warp + * 2. apply the `utils::mapping{}` + * 3. sync within warp + * 4. store the data. + * + * Assuming sizeof(T) >= sizeof(S) and the data is properly aligned (see the usage in `build`), this + * allows to re-structure the data within rows in-place. */ -inline void ivfpq_encode(uint32_t pq_dim, - uint32_t pq_bits, // 4 <= pq_bits <= 8 - const uint32_t* label, // [pq_dim, n_rows] - device_mdspan output, // [n_rows, ..] - rmm::cuda_stream_view stream) +template +void copy_warped(T* out, + uint32_t ld_out, + const S* in, + uint32_t ld_in, + uint32_t n_cols, + size_t n_rows, + rmm::cuda_stream_view stream) { constexpr uint32_t kBlockDim = 128; dim3 threads(kBlockDim, 1, 1); - dim3 blocks(raft::ceildiv(output.extent(0), kBlockDim), 1, 1); - switch (pq_bits) { - case 4: - return ivfpq_encode_kernel - <<>>(pq_dim, label, output); - case 5: - return ivfpq_encode_kernel - <<>>(pq_dim, label, output); - case 6: - return ivfpq_encode_kernel - <<>>(pq_dim, label, output); - case 7: - return ivfpq_encode_kernel - <<>>(pq_dim, label, output); - case 8: - return ivfpq_encode_kernel - <<>>(pq_dim, label, output); - default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); - } + dim3 blocks(div_rounding_up_safe(n_rows, kBlockDim / WarpSize), 1, 1); + copy_warped_kernel + <<>>(out, ld_out, in, ld_in, n_cols, n_rows); } +} // namespace + /** * @brief Fill-in a random orthogonal transformation matrix. * @@ -283,166 +262,55 @@ void select_residuals(const handle_t& handle, } /** + * @brief Compute residual vectors from the source dataset given by selected indices. + * + * The residual has the form + * `rotation_matrix %* (dataset[:, :] - centers[labels[:], 0:dim])` * - * @param handle, - * @param n_rows - * @param data_dim - * @param rot_dim - * @param pq_dim - * @param pq_len - * @param pq_bits - * @param n_clusters - * @param codebook_kind - * @param max_cluster_size - * @param cluster_centers // [n_clusters, data_dim] - * @param rotation_matrix // [rot_dim, data_dim] - * @param dataset // [n_rows] - * @param data_indices - * tells which indices to select in the dataset for each cluster [n_rows]; - * it should be partitioned by the clusters by now. - * @param cluster_sizes // [n_clusters] - * @param cluster_offsets // [n_clusters + 1] - * @param pq_centers // [...] (see ivf_pq::index::pq_centers() layout) - * @param pq_dataset - * // [n_rows, ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits), kIndexGroupVecLen] - * NB: in contrast to the final interleaved layout in ivf_pq::index::pq_dataset(), this function - * produces a non-interleaved data; it gets interleaved later when adding the data to the - * index. - * @param device_memory */ template -void compute_pq_codes( +void flat_compute_residuals( const handle_t& handle, + float* residuals, // [n_rows, rot_dim] IdxT n_rows, - uint32_t data_dim, - uint32_t rot_dim, - uint32_t pq_dim, - uint32_t pq_len, - uint32_t pq_bits, - uint32_t n_clusters, - codebook_gen codebook_kind, - uint32_t max_cluster_size, - float* cluster_centers, - const float* rotation_matrix, - const T* dataset, - const IdxT* data_indices, - const uint32_t* cluster_sizes, - const IdxT* cluster_offsets, - device_mdspan::pq_centers_extents, row_major> pq_centers, - device_mdspan pq_dataset, + device_mdspan, row_major> rotation_matrix, // [rot_dim, dim] + device_mdspan, row_major> centers, // [n_lists, dim_ext] + const T* dataset, // [n_rows, dim] + const uint32_t* labels, // [n_rows] rmm::mr::device_memory_resource* device_memory) { - common::nvtx::range fun_scope( - "ivf_pq::compute_pq_codes(n_rows = %zu, data_dim = %u, rot_dim = %u (%u * %u), n_clusters = " - "%u)", - size_t(n_rows), - data_dim, - rot_dim, - pq_dim, - pq_len, - n_clusters); - auto stream = handle.get_stream(); - - // - // Compute PQ code - // - - uint32_t pq_width = 1 << pq_bits; - rmm::device_uvector pq_centers_tmp(pq_len * pq_width, stream, device_memory); - rmm::device_uvector rot_vectors( - size_t(max_cluster_size) * size_t(rot_dim), stream, device_memory); - rmm::device_uvector sub_vectors( - size_t(max_cluster_size) * size_t(pq_dim * pq_len), stream, device_memory); - rmm::device_uvector sub_vector_labels( - size_t(max_cluster_size) * size_t(pq_dim), stream, device_memory); - - for (uint32_t l = 0; l < n_clusters; l++) { - auto cluster_size = cluster_sizes[l]; - common::nvtx::range cluster_scope( - "ivf_pq::compute_pq_codes::cluster[%u](size = %u)", l, cluster_size); - if (cluster_size == 0) continue; - - select_residuals(handle, - rot_vectors.data(), - IdxT(cluster_size), - data_dim, - rot_dim, - rotation_matrix, - cluster_centers + size_t(l) * size_t(data_dim), - dataset, - data_indices + cluster_offsets[l], - device_memory); - - // - // Change the order of the vector data to facilitate processing in - // each vector subspace. - // input: rot_vectors[cluster_size, rot_dim] = [cluster_size, pq_dim, pq_len] - // output: sub_vectors[pq_dim, cluster_size, pq_len] - // - for (uint32_t i = 0; i < pq_dim; i++) { - RAFT_CUDA_TRY( - cudaMemcpy2DAsync(sub_vectors.data() + size_t(i) * size_t(pq_len) * size_t(cluster_size), - sizeof(float) * pq_len, - rot_vectors.data() + i * pq_len, - sizeof(float) * rot_dim, - sizeof(float) * pq_len, - cluster_size, - cudaMemcpyDefault, - stream)); - } - - if (codebook_kind == codebook_gen::PER_CLUSTER) { - linalg::writeOnlyUnaryOp( - pq_centers_tmp.data(), - pq_len * pq_width, - [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) { - auto i0 = i / pq_len; - auto i1 = i % pq_len; - *out = pq_centers(l, i1, i0); - }, - stream); - } - - // - // Find a label (cluster ID) for each vector subspace. - // - for (uint32_t j = 0; j < pq_dim; j++) { - if (codebook_kind == codebook_gen::PER_SUBSPACE) { - linalg::writeOnlyUnaryOp( - pq_centers_tmp.data(), - pq_len * pq_width, - [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) { - auto i0 = i / pq_len; - auto i1 = i % pq_len; - *out = pq_centers(j, i1, i0); - }, - stream); - } - kmeans::predict(handle, - pq_centers_tmp.data(), - pq_width, - pq_len, - sub_vectors.data() + size_t(j) * size_t(cluster_size) * size_t(pq_len), - cluster_size, - sub_vector_labels.data() + size_t(j) * size_t(cluster_size), - raft::distance::DistanceType::L2Expanded, - stream, - device_memory); - } + auto stream = handle.get_stream(); + auto dim = rotation_matrix.extent(1); + auto rot_dim = rotation_matrix.extent(0); + rmm::device_uvector tmp(n_rows * dim, stream, device_memory); + linalg::writeOnlyUnaryOp( + tmp.data(), + tmp.size(), + [centers, dataset, labels, dim] __device__(float* out, size_t i) { + auto row_ix = i / dim; + auto el_ix = i % dim; + auto label = labels[row_ix]; + *out = utils::mapping{}(dataset[i]) - centers(label, el_ix); + }, + stream); - // - // PQ encoding - // - ivfpq_encode( - pq_dim, - pq_bits, - sub_vector_labels.data(), - make_mdspan( - pq_dataset.data_handle() + - size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2), - make_extents(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))), - stream); - } + float alpha = 1.0f; + float beta = 0.0f; + linalg::gemm(handle, + true, + false, + rot_dim, + n_rows, + dim, + &alpha, + rotation_matrix.data_handle(), + dim, + tmp.data(), + dim, + &beta, + residuals, + rot_dim, + stream); } template @@ -482,7 +350,7 @@ auto calculate_offsets_and_indices(IdxT n_rows, IdxT cumsum = 0; update_device(cluster_offsets, &cumsum, 1, stream); thrust::inclusive_scan( - exec_policy, cluster_sizes, cluster_sizes + n_lists, cluster_offsets + 1, thrust::plus{}); + exec_policy, cluster_sizes, cluster_sizes + n_lists, cluster_offsets + 1, add_op{}); update_host(&cumsum, cluster_offsets + n_lists, 1, stream); uint32_t max_cluster_size = *thrust::max_element(exec_policy, cluster_sizes, cluster_sizes + n_lists); @@ -673,20 +541,396 @@ void train_per_cluster(const handle_t& handle, } /** - * See raft::spatial::knn::ivf_pq::extend docs. + * Sort cluster by their size (descending). * - * This version requires `new_vectors` and `new_indices` (if non-null) to be on-device. + * @return Number of non-empty clusters */ +inline auto reorder_clusters_by_size_desc(const handle_t& handle, + uint32_t* ordering, + uint32_t* cluster_sizes_out, + const uint32_t* cluster_sizes_in, + uint32_t n_clusters, + rmm::mr::device_memory_resource* device_memory) + -> uint32_t +{ + auto stream = handle.get_stream(); + rmm::device_uvector cluster_ordering_in(n_clusters, stream, device_memory); + thrust::sequence(handle.get_thrust_policy(), + cluster_ordering_in.data(), + cluster_ordering_in.data() + n_clusters); + + int begin_bit = 0; + int end_bit = sizeof(uint32_t) * 8; + size_t cub_workspace_size = 0; + cub::DeviceRadixSort::SortPairsDescending(nullptr, + cub_workspace_size, + cluster_sizes_in, + cluster_sizes_out, + cluster_ordering_in.data(), + ordering, + n_clusters, + begin_bit, + end_bit, + stream); + rmm::device_buffer cub_workspace(cub_workspace_size, stream, device_memory); + cub::DeviceRadixSort::SortPairsDescending(cub_workspace.data(), + cub_workspace_size, + cluster_sizes_in, + cluster_sizes_out, + cluster_ordering_in.data(), + ordering, + n_clusters, + begin_bit, + end_bit, + stream); + + return thrust::lower_bound(handle.get_thrust_policy(), + cluster_sizes_out, + cluster_sizes_out + n_clusters, + 0, + thrust::greater()) - + cluster_sizes_out; +} + +/** + * Compute the code: find the closest cluster in each pq_dim-subspace. + * + * @tparam SubWarpSize + * how many threads work on a single vector; + * bouded by either WarpSize or pq_book_size. + * + * @param pq_centers + * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] + * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] + * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len]. + * the input must be already transformed to floats, rotated, and the level 1 cluster + * center must be already substructed (i.e. this is the residual of a single input vector). + * @param codebook_kind + * @param j index along pq_dim "dimension" + * @param cluster_ix is used for PER_CLUSTER codebooks. + */ +template +__device__ auto compute_pq_code( + device_mdspan, row_major> pq_centers, + device_mdspan, row_major> new_vector, + codebook_gen codebook_kind, + uint32_t j, + uint32_t cluster_ix) -> uint8_t +{ + using subwarp_align = Pow2; + uint32_t lane_id = subwarp_align::mod(laneId()); + uint32_t partition_ix; + switch (codebook_kind) { + case codebook_gen::PER_CLUSTER: { + partition_ix = cluster_ix; + } break; + case codebook_gen::PER_SUBSPACE: { + partition_ix = j; + } break; + default: __builtin_unreachable(); + } + + const uint32_t pq_book_size = pq_centers.extent(2); + const uint32_t pq_len = pq_centers.extent(1); + float min_dist = std::numeric_limits::infinity(); + uint8_t code = 0; + // calculate the distance for each PQ cluster, find the minimum for each thread + for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) { + // NB: the L2 quantifiers on residuals are always trained on L2 metric. + float d = 0.0f; + for (uint32_t k = 0; k < pq_len; k++) { + auto t = new_vector(j, k) - pq_centers(partition_ix, k, i); + d += t * t; + } + if (d < min_dist) { + min_dist = d; + code = uint8_t(i); + } + } + // reduce among threads +#pragma unroll + for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) { + const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize); + const auto other_code = shfl_xor(code, stride, SubWarpSize); + if (other_dist < min_dist) { + min_dist = other_dist; + code = other_code; + } + } + return code; +} + +template +__launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( + device_mdspan, row_major> new_vectors, + std::variant src_offset_or_indices, + const uint32_t* new_labels, + device_mdspan, row_major> list_sizes, + device_mdspan, row_major> list_offsets, + device_mdspan, row_major> pq_indices, + device_mdspan pq_dataset, + device_mdspan, row_major> pq_centers, + codebook_gen codebook_kind) +{ + constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); + using subwarp_align = Pow2; + const uint32_t lane_id = subwarp_align::mod(threadIdx.x); + const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x}); + if (row_ix >= new_vectors.extent(0)) { return; } + + const uint32_t cluster_ix = new_labels[row_ix]; + uint32_t out_incluster_ix; + if (lane_id == 0) { out_incluster_ix = atomicAdd(&list_sizes(cluster_ix), 1); } + out_incluster_ix = shfl(out_incluster_ix, 0, kSubWarpSize); + const IdxT out_ix = list_offsets(cluster_ix) + out_incluster_ix; + + // write the label + if (lane_id == 0) { + if (std::holds_alternative(src_offset_or_indices)) { + pq_indices(out_ix) = std::get(src_offset_or_indices) + row_ix; + } else { + pq_indices(out_ix) = std::get(src_offset_or_indices)[row_ix]; + } + } + + // write the codes + using group_align = Pow2; + const uint32_t group_ix = group_align::div(out_ix); + const uint32_t ingroup_ix = group_align::mod(out_ix); + const uint32_t pq_len = pq_centers.extent(1); + const uint32_t pq_dim = new_vectors.extent(1) / pq_len; + + __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)]; + pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)]; + bitfield_view_t out{reinterpret_cast(&code)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // clear the chunk for writing + if (lane_id == 0) { code = pq_vec_t{}; } + // fill-in the values, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // find the label + using layout_t = typename decltype(new_vectors)::layout_type; + using accessor_t = typename decltype(new_vectors)::accessor_type; + auto one_vector = mdspan, layout_t, accessor_t>( + &new_vectors(row_ix, 0), extent_2d{pq_dim, pq_len}); + auto l = compute_pq_code(pq_centers, one_vector, codebook_kind, j, cluster_ix); + if (lane_id == 0) { out[k] = l; } + } + // write the chunk into the dataset + if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; } + } +} + +/** + * Assuming the index already has some data and allocated the space for more, write more data in it. + * There must be enough free space in `pq_dataset()` and `indices()`, as computed using + * `list_offsets()` and `list_sizes()`. + * + * NB: Since the pq_dataset is stored in the interleaved blocked format (see ivf_pq_types.hpp), one + * cannot just concatenate the old and the new codes; the positions for the codes are determined the + * same way as in the ivfpq_compute_similarity_kernel (see ivf_pq_search.cuh). + * + * @tparam T + * @tparam IdxT + * + * @param handle + * @param index + * @param[in] new_vectors + * a pointer to a row-major device array [index.dim(), n_rows]; + * @param[in] src_offset_or_indices + * references for the new data: + * either a starting index for the auto-indexing + * or a pointer to a device array of explicit indices [n_rows]; + * @param[in] new_labels + * cluster ids (first-level quantization) - a device array [n_rows]; + * @param n_rows + * the number of records to write in. + * @param mr + * a memory resource to use for device allocations + */ +template +void process_and_fill_codes(const handle_t& handle, + index& index, + const T* new_vectors, + std::variant src_offset_or_indices, + const uint32_t* new_labels, + IdxT n_rows, + rmm::mr::device_memory_resource* mr) +{ + pq_int_vec_exts pq_extents = make_extents(index.pq_dataset().extent(0), + index.pq_dataset().extent(1), + index.pq_dataset().static_extent(2)); + auto pq_dataset = make_mdspan( + reinterpret_cast(index.pq_dataset().data_handle()), pq_extents); + + auto new_vectors_residual = + make_device_mdarray(handle, mr, make_extents(n_rows, index.rot_dim())); + + flat_compute_residuals(handle, + new_vectors_residual.data_handle(), + n_rows, + index.rotation_matrix(), + index.centers(), + new_vectors, + new_labels, + mr); + + constexpr uint32_t kBlockSize = 256; + const uint32_t threads_per_vec = std::min(WarpSize, index.pq_book_size()); + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize / threads_per_vec), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [](uint32_t pq_bits) { + switch (pq_bits) { + case 4: return process_and_fill_codes_kernel; + case 5: return process_and_fill_codes_kernel; + case 6: return process_and_fill_codes_kernel; + case 7: return process_and_fill_codes_kernel; + case 8: return process_and_fill_codes_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(index.pq_bits()); + kernel<<>>(new_vectors_residual.view(), + src_offset_or_indices, + new_labels, + index.list_sizes(), + index.list_offsets(), + index.indices(), + pq_dataset, + index.pq_centers(), + index.codebook_kind()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Fill the `target` index with the data from the `source`, except `list_offsets`. + * The `target` index must have the same settings and valid `list_offsets`, and must have been + * pre-allocated to fit the whole `source` data. + * As a result, the `target` index is in a valid state; it's identical to the `source`, except + * has more unused space in `pq_dataset`. + * + * @param target the index to be filled-in + * @param source the index to get data from + * @param cluster_ordering + * a pointer to the managed data [n_clusters]; + * the mapping `source_label = cluster_ordering[target_label]` + * @param stream + */ +template +void copy_index_data(index& target, + const index& source, + const uint32_t* cluster_ordering, + rmm::cuda_stream_view stream) +{ + auto n_clusters = target.n_lists(); + RAFT_EXPECTS(target.size() >= source.size(), + "The target index must be not smaller than the source index."); + RAFT_EXPECTS(n_clusters >= source.n_lists(), + "The target and the source are not compatible (different numbers of clusters)."); + + // Copy the unchanged parts + copy(target.rotation_matrix().data_handle(), + source.rotation_matrix().data_handle(), + source.rotation_matrix().size(), + stream); + + // copy cluster-ordering-dependent data + utils::copy_selected(n_clusters, + uint32_t{1}, + source.list_sizes().data_handle(), + cluster_ordering, + uint32_t{1}, + target.list_sizes().data_handle(), + uint32_t{1}, + stream); + utils::copy_selected(n_clusters, + target.dim_ext(), + source.centers().data_handle(), + cluster_ordering, + source.dim_ext(), + target.centers().data_handle(), + target.dim_ext(), + stream); + utils::copy_selected(n_clusters, + target.rot_dim(), + source.centers_rot().data_handle(), + cluster_ordering, + source.rot_dim(), + target.centers_rot().data_handle(), + target.rot_dim(), + stream); + switch (source.codebook_kind()) { + case codebook_gen::PER_SUBSPACE: { + copy(target.pq_centers().data_handle(), + source.pq_centers().data_handle(), + source.pq_centers().size(), + stream); + } break; + case codebook_gen::PER_CLUSTER: { + auto d = source.pq_book_size() * source.pq_len(); + utils::copy_selected(n_clusters, + d, + source.pq_centers().data_handle(), + cluster_ordering, + d, + target.pq_centers().data_handle(), + d, + stream); + } break; + default: RAFT_FAIL("Unreachable code"); + } + + // Fill the data with the old clusters. + if (source.size() > 0) { + std::vector target_cluster_offsets(n_clusters + 1); + std::vector source_cluster_offsets(n_clusters + 1); + std::vector source_cluster_sizes(n_clusters); + copy(target_cluster_offsets.data(), + target.list_offsets().data_handle(), + target.list_offsets().size(), + stream); + copy(source_cluster_offsets.data(), + source.list_offsets().data_handle(), + source.list_offsets().size(), + stream); + copy(source_cluster_sizes.data(), + source.list_sizes().data_handle(), + source.list_sizes().size(), + stream); + stream.synchronize(); + auto data_exts = target.pq_dataset().extents(); + auto data_unit = size_t(data_exts.extent(3)) * size_t(data_exts.extent(1)); + auto data_mod = size_t(data_exts.extent(2)); + for (uint32_t l = 0; l < target.n_lists(); l++) { + auto k = cluster_ordering[l]; + auto source_cluster_size = source_cluster_sizes[k]; + if (source_cluster_size > 0) { + copy(target.indices().data_handle() + target_cluster_offsets[l], + source.indices().data_handle() + source_cluster_offsets[k], + source_cluster_size, + stream); + copy(target.pq_dataset().data_handle() + target_cluster_offsets[l] * data_unit, + source.pq_dataset().data_handle() + source_cluster_offsets[k] * data_unit, + round_up_safe(source_cluster_size, data_mod) * data_unit, + stream); + } + } + } +} + +/** See raft::spatial::knn::ivf_pq::extend docs */ template -inline auto extend_device(const handle_t& handle, - const index& orig_index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) -> index +auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim()); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); + const auto n_clusters = orig_index.n_lists(); RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0, "You must pass data indices when the index is non-empty."); @@ -694,13 +938,6 @@ inline auto extend_device(const handle_t& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); - switch (new_indices != nullptr ? utils::check_pointer_residency(new_vectors, new_indices) - : utils::check_pointer_residency(new_vectors)) { - case utils::pointer_residency::device_only: - case utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("[ivf_pq::extend_device] The added data must be available on device."); - } - rmm::mr::device_memory_resource* device_memory = nullptr; auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); if (pool_guard) { @@ -712,154 +949,134 @@ inline auto extend_device(const handle_t& handle, rmm::mr::pool_memory_resource managed_memory( &managed_memory_upstream, 1024 * 1024); - // - // The cluster_centers stored in index contain data other than cluster - // centroids to speed up the search. Here, only the cluster centroids - // are extracted. - // - const auto n_clusters = orig_index.n_lists(); + // Try to allocate an index with the same parameters and the projected new size + // (which can be slightly larger than index.size() + n_rows, due to padding). + // If this fails, the index would be too big to fit in the device anyway. + std::optional> placeholder_index(std::in_place_t{}, + handle, + orig_index.metric(), + orig_index.codebook_kind(), + n_clusters, + orig_index.dim(), + orig_index.pq_bits(), + orig_index.pq_dim(), + orig_index.n_nonempty_lists()); + placeholder_index->allocate( + handle, + orig_index.size() + n_rows + (kIndexGroupSize - 1) * std::min(n_clusters, n_rows)); + + // Available device memory + size_t free_mem, total_mem; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + + // Decide on an approximate threshold when we'd better start saving device memory by using + // managed allocations for large device buffers + rmm::mr::device_memory_resource* labels_mr = device_memory; + rmm::mr::device_memory_resource* batches_mr = device_memory; + if (n_rows * + (orig_index.dim() * sizeof(T) + orig_index.pq_dim() + sizeof(IdxT) + sizeof(uint32_t)) > + free_mem) { + labels_mr = &managed_memory; + } + // Allocate a buffer for the new labels (classifying the new data) + rmm::device_uvector new_data_labels(n_rows, stream, labels_mr); + if (labels_mr == device_memory) { free_mem -= sizeof(uint32_t) * n_rows; } - rmm::device_uvector cluster_centers( - size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), - sizeof(float) * orig_index.dim(), - orig_index.centers().data_handle(), - sizeof(float) * orig_index.dim_ext(), - sizeof(float) * orig_index.dim(), - n_clusters, - cudaMemcpyDefault, - stream)); - - // - // Use the existing cluster centroids to find the label (cluster ID) - // of the vector to be added. - // - - rmm::device_uvector new_data_labels(n_rows, stream, device_memory); - utils::memzero(new_data_labels.data(), n_rows, stream); - rmm::device_uvector new_cluster_sizes_buf(n_clusters, stream, &managed_memory); - auto new_cluster_sizes = new_cluster_sizes_buf.data(); - utils::memzero(new_cluster_sizes, n_clusters, stream); + // Calculate the batch size for the input data if it's not accessible directly from the device + constexpr size_t kReasonableMaxBatchSize = 65536; + size_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); + { + size_t size_factor = 0; + // we'll use two temporary buffers for converted inputs when computing the codes. + size_factor += (orig_index.dim() + orig_index.rot_dim()) * sizeof(float); + // ...and another buffer for indices + size_factor += sizeof(IdxT); + // if the input data is not accessible on device, we'd need a buffer for it. + switch (utils::check_pointer_residency(new_vectors)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: size_factor += orig_index.dim() * sizeof(T); + } + // the same with indices + if (new_indices != nullptr) { + switch (utils::check_pointer_residency(new_indices)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: size_factor += sizeof(IdxT); + } + } + // make the batch size fit into the remaining memory + while (size_factor * max_batch_size > free_mem && max_batch_size > 128) { + max_batch_size >>= 1; + } + if (size_factor * max_batch_size > free_mem) { + // if that still doesn't fit, resort to the UVM + batches_mr = &managed_memory; + max_batch_size = kReasonableMaxBatchSize; + } else { + // If we're keeping the batches in device memory, update the available mem tracker. + free_mem -= size_factor * max_batch_size; + } + } - kmeans::predict(handle, - cluster_centers.data(), - n_clusters, - orig_index.dim(), - new_vectors, - n_rows, - new_data_labels.data(), - orig_index.metric(), - stream); - raft::stats::histogram(raft::stats::HistTypeAuto, - reinterpret_cast(new_cluster_sizes), - IdxT(n_clusters), - new_data_labels.data(), - n_rows, - 1, - stream); - - // - // Make new_cluster_offsets, new_data_indices - // - rmm::device_uvector new_data_indices(n_rows, stream, &managed_memory); - rmm::device_uvector new_cluster_offsets(n_clusters + 1, stream, &managed_memory); - uint32_t new_max_cluster_size = calculate_offsets_and_indices(n_rows, - n_clusters, - new_data_labels.data(), - new_cluster_sizes, - new_cluster_offsets.data(), - new_data_indices.data(), - stream); - - // - // Compute PQ code for new vectors - // - pq_codes_exts new_pq_exts = make_extents( - n_rows, orig_index.pq_dataset().extent(1), orig_index.pq_dataset().static_extent(3)); - auto new_pq_codes = make_device_mdarray(handle, device_memory, new_pq_exts); - compute_pq_codes(handle, - n_rows, - orig_index.dim(), - orig_index.rot_dim(), - orig_index.pq_dim(), - orig_index.pq_len(), - orig_index.pq_bits(), - n_clusters, - orig_index.codebook_kind(), - new_max_cluster_size, + // Predict the cluster labels for the new data, in batches if necessary + utils::batch_load_iterator vec_batches( + new_vectors, n_rows, orig_index.dim(), max_batch_size, stream, batches_mr); + // Release the placeholder memory, because we don't intend to allocate any more long-living + // temporary buffers before we allocate the ext_index data. + // This memory could potentially speed up UVM accesses, if any. + placeholder_index.reset(); + { + // The cluster centers in the index are stored padded, which is not acceptable by + // the kmeans::predict. Thus, we need the restructuring copy. + rmm::device_uvector cluster_centers( + size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), + sizeof(float) * orig_index.dim(), + orig_index.centers().data_handle(), + sizeof(float) * orig_index.dim_ext(), + sizeof(float) * orig_index.dim(), + n_clusters, + cudaMemcpyDefault, + stream)); + for (const auto& batch : vec_batches) { + kmeans::predict(handle, cluster_centers.data(), - orig_index.rotation_matrix().data_handle(), - new_vectors, - new_data_indices.data(), - new_cluster_sizes, - new_cluster_offsets.data(), - orig_index.pq_centers(), - new_pq_codes.view(), + n_clusters, + orig_index.dim(), + batch.data(), + batch.size(), + new_data_labels.data() + batch.offset(), + orig_index.metric(), + stream, device_memory); + } + } // Get the combined cluster sizes and sort the clusters in decreasing order // (this makes it easy to estimate the max number of samples during search). - rmm::device_uvector old_cluster_sizes_buf(n_clusters, stream, &managed_memory); - rmm::device_uvector ext_cluster_sizes_buf(n_clusters, stream, &managed_memory); - rmm::device_uvector old_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); - rmm::device_uvector ext_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); rmm::device_uvector cluster_ordering_buf(n_clusters, stream, &managed_memory); - auto old_cluster_sizes = old_cluster_sizes_buf.data(); - auto ext_cluster_sizes = ext_cluster_sizes_buf.data(); - auto old_cluster_offsets = old_cluster_offsets_buf.data(); - auto ext_cluster_offsets = ext_cluster_offsets_buf.data(); - auto cluster_ordering = cluster_ordering_buf.data(); - copy(old_cluster_offsets, - orig_index.list_offsets().data_handle(), - orig_index.list_offsets().size(), - stream); - copy(old_cluster_sizes, - orig_index.list_sizes().data_handle(), - orig_index.list_sizes().size(), - stream); - + rmm::device_uvector ext_cluster_sizes_buf(n_clusters, stream, device_memory); + auto cluster_ordering = cluster_ordering_buf.data(); + auto ext_cluster_sizes = ext_cluster_sizes_buf.data(); uint32_t n_nonempty_lists = 0; { - rmm::device_uvector ext_cluster_sizes_buf_in(n_clusters, stream, device_memory); - rmm::device_uvector cluster_ordering_in(n_clusters, stream, device_memory); - auto ext_cluster_sizes_in = ext_cluster_sizes_buf_in.data(); - linalg::add(ext_cluster_sizes_in, old_cluster_sizes, new_cluster_sizes, n_clusters, stream); - - thrust::sequence(handle.get_thrust_policy(), - cluster_ordering_in.data(), - cluster_ordering_in.data() + n_clusters); - - int begin_bit = 0; - int end_bit = sizeof(uint32_t) * 8; - size_t cub_workspace_size = 0; - cub::DeviceRadixSort::SortPairsDescending(nullptr, - cub_workspace_size, - ext_cluster_sizes_in, - ext_cluster_sizes, - cluster_ordering_in.data(), - cluster_ordering, - n_clusters, - begin_bit, - end_bit, - stream); - rmm::device_buffer cub_workspace(cub_workspace_size, stream, device_memory); - cub::DeviceRadixSort::SortPairsDescending(cub_workspace.data(), - cub_workspace_size, - ext_cluster_sizes_in, - ext_cluster_sizes, - cluster_ordering_in.data(), - cluster_ordering, - n_clusters, - begin_bit, - end_bit, - stream); - - n_nonempty_lists = thrust::lower_bound(handle.get_thrust_policy(), - ext_cluster_sizes, - ext_cluster_sizes + n_clusters, - 0, - thrust::greater()) - - ext_cluster_sizes; + rmm::device_uvector new_cluster_sizes_buf(n_clusters, stream, device_memory); + auto new_cluster_sizes = new_cluster_sizes_buf.data(); + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(new_cluster_sizes), + IdxT(n_clusters), + new_data_labels.data(), + n_rows, + 1, + stream); + linalg::add(new_cluster_sizes, + new_cluster_sizes, + orig_index.list_sizes().data_handle(), + n_clusters, + stream); + n_nonempty_lists = reorder_clusters_by_size_desc( + handle, cluster_ordering, ext_cluster_sizes, new_cluster_sizes, n_clusters, device_memory); } // Assemble the extended index @@ -871,193 +1088,66 @@ inline auto extend_device(const handle_t& handle, orig_index.pq_bits(), orig_index.pq_dim(), n_nonempty_lists); - // calculate extended cluster offsets + // calculate extended cluster offsets and allocate the index data { - using group_align = Pow2; - IdxT size = 0; + auto ext_cluster_offsets = ext_index.list_offsets().data_handle(); + using group_align = Pow2; + IdxT size = 0; update_device(ext_cluster_offsets, &size, 1, stream); - thrust::inclusive_scan( - handle.get_thrust_policy(), - ext_cluster_sizes, - ext_cluster_sizes + n_clusters, - ext_cluster_offsets + 1, - [] __device__(IdxT a, IdxT b) { return group_align::roundUp(a) + group_align::roundUp(b); }); + auto sizes_padded = thrust::make_transform_iterator( + ext_cluster_sizes, [] __device__ __host__(uint32_t x) -> IdxT { + return IdxT{Pow2::roundUp(x)}; + }); + thrust::inclusive_scan(handle.get_thrust_policy(), + sizes_padded, + sizes_padded + n_clusters, + ext_cluster_offsets + 1, + add_op{}); update_host(&size, ext_cluster_offsets + n_clusters, 1, stream); - handle.sync_stream(); - copy(ext_index.list_offsets().data_handle(), - ext_cluster_offsets, - ext_index.list_offsets().size(), - stream); - copy(ext_index.list_sizes().data_handle(), - ext_cluster_sizes, - ext_index.list_sizes().size(), - stream); + handle.sync_stream(); // syncs `size`, `cluster_ordering` ext_index.allocate(handle, size); } - // Copy the unchanged parts - copy(ext_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().size(), - stream); + // pre-fill the extended index with the data from the original index + copy_index_data(ext_index, orig_index, cluster_ordering, stream); - // copy cluster-ordering-dependent data - utils::copy_selected(n_clusters, - ext_index.dim_ext(), - orig_index.centers().data_handle(), - cluster_ordering, - orig_index.dim_ext(), - ext_index.centers().data_handle(), - ext_index.dim_ext(), - stream); - utils::copy_selected(n_clusters, - ext_index.rot_dim(), - orig_index.centers_rot().data_handle(), - cluster_ordering, - orig_index.rot_dim(), - ext_index.centers_rot().data_handle(), - ext_index.rot_dim(), - stream); - switch (orig_index.codebook_kind()) { - case codebook_gen::PER_SUBSPACE: { - copy(ext_index.pq_centers().data_handle(), - orig_index.pq_centers().data_handle(), - orig_index.pq_centers().size(), - stream); - } break; - case codebook_gen::PER_CLUSTER: { - auto d = orig_index.pq_book_size() * orig_index.pq_len(); - utils::copy_selected(n_clusters, - d, - orig_index.pq_centers().data_handle(), - cluster_ordering, - d, - ext_index.pq_centers().data_handle(), - d, - stream); - } break; - default: RAFT_FAIL("Unreachable code"); - } - - // Make ext_indices - handle.sync_stream(); // make sure cluster sizes are up-to-date - auto ext_indices = ext_index.indices().data_handle(); - for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - auto k = cluster_ordering[l]; - auto old_cluster_size = old_cluster_sizes[k]; - auto new_cluster_size = new_cluster_sizes[k]; - if (old_cluster_size > 0) { - copy(ext_indices + ext_cluster_offsets[l], - orig_index.indices().data_handle() + old_cluster_offsets[k], - old_cluster_size, - stream); - } - if (new_cluster_size > 0) { - if (new_indices == nullptr) { - // implies the orig index is empty - copy(ext_indices + ext_cluster_offsets[l] + old_cluster_size, - new_data_indices.data() + new_cluster_offsets.data()[k], - new_cluster_size, - stream); - } else { - utils::copy_selected((IdxT)new_cluster_size, - (IdxT)1, - new_indices, - new_data_indices.data() + new_cluster_offsets.data()[k], - (IdxT)1, - ext_indices + ext_cluster_offsets[l] + old_cluster_size, - (IdxT)1, - stream); - } + // update the labels to correspond to the new cluster ordering + { + rmm::device_uvector cluster_ordering_rev_buf(n_clusters, stream, &managed_memory); + auto cluster_ordering_rev = cluster_ordering_rev_buf.data(); + for (uint32_t i = 0; i < n_clusters; i++) { + cluster_ordering_rev[cluster_ordering[i]] = i; } + linalg::unaryOp( + new_data_labels.data(), + new_data_labels.data(), + new_data_labels.size(), + [cluster_ordering_rev] __device__(uint32_t i) { return cluster_ordering_rev[i]; }, + stream); } - /* Extend the pq_dataset */ - // For simplicity and performance, we reinterpret the last dimension of the dataset - // as a single vector element. - using vec_t = TxN_t::io_t; - - auto data_unit = ext_index.pq_dataset().extent(1); - auto ext_pq_dataset = make_mdspan( - reinterpret_cast(ext_index.pq_dataset().data_handle()), - make_extents( - ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); - - for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - // Extend the data cluster-by-cluster; - // The original/old index stores the data interleaved; - // the new data produced by `compute_pq_codes` is not interleaved. - auto k = cluster_ordering[l]; - auto old_cluster_size = old_cluster_sizes[k]; - auto old_pq_dataset = make_mdspan( - reinterpret_cast(orig_index.pq_dataset().data_handle()) + - data_unit * old_cluster_offsets[k], - make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), - data_unit, - ext_pq_dataset.extent(2))); - auto new_pq_data = make_mdspan( - reinterpret_cast(new_pq_codes.data_handle()) + - data_unit * new_cluster_offsets.data()[k], - make_extents(new_cluster_sizes[k], data_unit)); - // Write all cluster data, vec-by-vec - linalg::writeOnlyUnaryOp( - ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l], - data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), - [old_pq_dataset, new_pq_data, old_cluster_size] __device__(vec_t * out, size_t i_flat) { - // find the proper 3D index from the flat offset - size_t i[3]; - for (int r = 2; r > 0; r--) { - i[r] = i_flat % old_pq_dataset.extent(r); - i_flat /= old_pq_dataset.extent(r); - } - i[0] = i_flat; - auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2]; - if (row_ix < old_cluster_size) { - // First, pack the original/old data - *out = old_pq_dataset(i[0], i[1], i[2]); - } else { - // Then add the new data - row_ix -= old_cluster_size; - if (row_ix < new_pq_data.extent(0)) { - *out = new_pq_data(row_ix, i[1]); - } else { - *out = vec_t{}; - } - } - }, - stream); + // fill the extended index with the new data (possibly, in batches) + utils::batch_load_iterator idx_batches( + new_indices, n_rows, 1, max_batch_size, stream, batches_mr); + for (const auto& vec_batch : vec_batches) { + const auto& idx_batch = *idx_batches++; + process_and_fill_codes(handle, + ext_index, + vec_batch.data(), + new_indices != nullptr + ? std::variant(idx_batch.data()) + : std::variant(IdxT(idx_batch.offset())), + new_data_labels.data() + vec_batch.offset(), + IdxT(vec_batch.size()), + batches_mr); } return ext_index; } -/** See raft::spatial::knn::ivf_pq::extend docs */ -template -inline auto extend(const handle_t& handle, - const index& orig_index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) -> index -{ - size_t vec_size = sizeof(T) * size_t(n_rows) * size_t(orig_index.dim()); - size_t ind_size = sizeof(IdxT) * size_t(n_rows); - return utils::with_mapped_memory_t{ - new_vectors, vec_size, [&](const T* new_vectors_dev) { - return utils::with_mapped_memory_t{ - new_indices, ind_size, [&](const IdxT* new_indices_dev) { - return extend_device( - handle, orig_index, new_vectors_dev, new_indices_dev, n_rows); - }}(); - }}(); -} - -/** - * See raft::spatial::knn::ivf_pq::build docs. - * - * This version requires `dataset` to be on-device. - */ +/** See raft::spatial::knn::ivf_pq::build docs */ template -inline auto build_device( +auto build( const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) -> index { @@ -1068,12 +1158,6 @@ inline auto build_device( RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); - switch (utils::check_pointer_residency(dataset)) { - case utils::pointer_residency::device_only: - case utils::pointer_residency::host_and_device: break; - default: RAFT_FAIL("[ivf_pq::build_device] The dataset pointer must be available on device."); - } - auto stream = handle.get_stream(); index index(handle, params, dim); @@ -1122,15 +1206,45 @@ inline auto build_device( cudaMemcpyDefault, stream)); } else { - auto dim = index.dim(); - linalg::writeOnlyUnaryOp( - trainset.data(), - size_t(index.dim()) * n_rows_train, - [dataset, trainset_ratio, dim] __device__(float* out, size_t i) { - auto col = i % dim; - *out = utils::mapping{}(dataset[(i - col) * size_t(trainset_ratio) + col]); - }, - stream); + size_t dim = index.dim(); + cudaPointerAttributes dataset_attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&dataset_attr, dataset)); + if (dataset_attr.devicePointer != nullptr) { + // data is available on device: just run the kernel to copy and map the data + auto p = reinterpret_cast(dataset_attr.devicePointer); + linalg::writeOnlyUnaryOp( + trainset.data(), + dim * n_rows_train, + [p, trainset_ratio, dim] __device__(float* out, size_t i) { + auto col = i % dim; + *out = utils::mapping{}(p[(i - col) * size_t(trainset_ratio) + col]); + }, + stream); + } else { + // data is not available: first copy, then map inplace + auto trainset_tmp = reinterpret_cast(reinterpret_cast(trainset.data()) + + (sizeof(float) - sizeof(T)) * index.dim()); + // We copy the data in strides, one row at a time, and place the smaller rows of type T + // at the end of float rows. + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset_tmp, + sizeof(float) * index.dim(), + dataset, + sizeof(T) * index.dim() * trainset_ratio, + sizeof(T) * index.dim(), + n_rows_train, + cudaMemcpyDefault, + stream)); + // Transform the input `{T -> float}`, one row per warp. + // The threads in each warp copy the data synchronously; this and the layout of the data + // (content is aligned to the end of the rows) together allow doing the transform in-place. + copy_warped(trainset.data(), + index.dim(), + trainset_tmp, + index.dim() * sizeof(float) / sizeof(T), + index.dim(), + n_rows_train, + stream); + } } // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, dim_ext]! @@ -1245,25 +1359,12 @@ inline auto build_device( // add the data if necessary if (params.add_data_on_build) { - return detail::extend_device(handle, index, dataset, nullptr, n_rows); + return detail::extend(handle, index, dataset, nullptr, n_rows); } else { return index; } } -/** See raft::spatial::knn::ivf_pq::build docs */ -template -inline auto build( - const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) - -> index -{ - size_t data_size = sizeof(T) * size_t(n_rows) * size_t(dim); - return utils::with_mapped_memory_t{dataset, data_size, [&](const T* dataset_dev) { - return build_device( - handle, params, dataset_dev, n_rows, dim); - }}(); -} - static const int serialization_version = 1; /** From 53ba2261317c3eceb5bf259cf40057084054d19d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 9 Jan 2023 15:51:50 -0500 Subject: [PATCH 03/44] Adding ability to use an existing stream in the pylibraft Handle (#1125) Closes #1123 Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/1125 --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 2 +- .../raft/spatial/knn/detail/ann_utils.cuh | 2 +- python/pylibraft/pylibraft/common/cuda.pyx | 9 +++- python/pylibraft/pylibraft/common/handle.pyx | 47 +++++++++++++++++-- .../pylibraft/pylibraft/test/test_distance.py | 9 ++-- .../pylibraft/pylibraft/test/test_handle.py | 47 +++++++++++++++++++ 6 files changed, 105 insertions(+), 11 deletions(-) create mode 100644 python/pylibraft/pylibraft/test/test_handle.py diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 244d1879d8..51364e1ee6 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 32d4f67a20..395714a161 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx index 7400c8550f..c164a463ae 100644 --- a/python/pylibraft/pylibraft/common/cuda.pyx +++ b/python/pylibraft/pylibraft/common/cuda.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ from cuda.ccudart cimport ( cudaStreamSynchronize, cudaSuccess, ) +from libc.stdint cimport uintptr_t class CudaRuntimeError(RuntimeError): @@ -80,3 +81,9 @@ cdef class Stream: cdef cudaStream_t getStream(self): return self.s + + def get_ptr(self): + """ + Return the uintptr_t pointer of the underlying cudaStream_t handle + """ + return self.s diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index 13fc7fc98e..2821cb7f8a 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,9 @@ import functools +from cuda.ccudart cimport cudaStream_t +from libc.stdint cimport uintptr_t + from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread, cuda_stream_view from .cuda cimport Stream @@ -34,9 +37,15 @@ cdef class Handle: of handle_t exposed by RAFT's C++ interface. Refer to the header file raft/handle.hpp for interface level details of this struct + Parameters + ---------- + stream : Optional stream to use for ordering CUDA instructions + Accepts pylibraft.common.Stream() or uintptr_t (cudaStream_t) + Examples -------- + Basic usage: >>> from pylibraft.common import Stream, Handle >>> stream = Stream() >>> handle = Handle(stream) @@ -48,14 +57,33 @@ cdef class Handle: >>> # the default stream inside the `handle_t` is being used >>> handle.sync() >>> del handle # optional! + + Using a cuPy stream with RAFT handle: + >>> import cupy + >>> from pylibraft.common import Stream, Handle + >>> + >>> cupy_stream = cupy.cuda.Stream() + >>> handle = Handle(stream=cupy_stream.ptr) + + Using a RAFT stream with CuPy ExternalStream: + >>> import cupy + >>> from pylibraft.common import Stream + >>> + >>> raft_stream = Stream() + >>> cupy_stream = cupy.cuda.ExternalStream(raft_stream.get_ptr()) """ - def __cinit__(self, stream: Stream = None, n_streams=0): + def __cinit__(self, stream=None, n_streams=0): self.n_streams = n_streams + if n_streams > 0: self.stream_pool.reset(new cuda_stream_pool(n_streams)) + cdef uintptr_t s cdef cuda_stream_view c_stream + + # We should either have a pylibraft.common.Stream or a uintptr_t + # of a cudaStream_t if stream is None: # this constructor will construct a "main" handle on # per-thread default stream, which is non-blocking @@ -63,9 +91,20 @@ cdef class Handle: self.stream_pool)) else: # this constructor constructs a handle on user stream - c_stream = cuda_stream_view(stream.getStream()) + if isinstance(stream, Stream): + # Stream is pylibraft Stream() + s = stream.get_ptr() + c_stream = cuda_stream_view(s) + elif isinstance(stream, int): + # Stream is a pointer, cast to cudaStream_t + s = stream + c_stream = cuda_stream_view(s) + else: + raise ValueError("stream should be common.Stream() or " + "uintptr_t to cudaStream_t") + self.c_obj.reset(new handle_t(c_stream, - self.stream_pool)) + self.stream_pool)) def sync(self): """ diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index a08656d3aa..9c8a608f6e 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import pytest from scipy.spatial.distance import cdist -from pylibraft.common import Handle, device_ndarray +from pylibraft.common import Handle, Stream, device_ndarray from pylibraft.distance import pairwise_distance @@ -64,9 +64,10 @@ def test_distance(n_rows, n_cols, inplace, metric, order, dtype): input1_device = device_ndarray(input1) output_device = device_ndarray(output) if inplace else None - handle = Handle() + s2 = Stream() + handle = Handle(stream=s2) ret_output = pairwise_distance( - input1_device, input1_device, output_device, metric + input1_device, input1_device, output_device, metric, handle=handle ) handle.sync() diff --git a/python/pylibraft/pylibraft/test/test_handle.py b/python/pylibraft/pylibraft/test/test_handle.py new file mode 100644 index 0000000000..877bf442f8 --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_handle.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest + +from pylibraft.common import Handle, Stream, device_ndarray +from pylibraft.distance import pairwise_distance + +try: + import cupy +except ImportError: + pytest.skip(reason="cupy not installed.") + + +@pytest.mark.parametrize("stream", [cupy.cuda.Stream().ptr, Stream()]) +def test_handle_external_stream(stream): + + input1 = np.random.random_sample((50, 3)) + input1 = np.asarray(input1, order="F").astype("float") + + output = np.zeros((50, 50), dtype="float") + + input1_device = device_ndarray(input1) + output_device = device_ndarray(output) + + # We are just testing that this doesn't segfault + handle = Handle(stream) + pairwise_distance( + input1_device, input1_device, output_device, "euclidean", handle=handle + ) + handle.sync() + + with pytest.raises(ValueError): + handle = Handle(stream=1.0) From b5c2b39ae0cd48b0c3031c8a545fe53818c5096e Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 10 Jan 2023 14:19:00 +0100 Subject: [PATCH 04/44] Fix `euclidean_dist` in IVF-Flat search (#1122) Solves #1058 This is a tricky bug so the fix deserves some explanation. The previous implementation of `euclidean_dist` was the following in vectorized cases, where `x` and `y` are `int32` vectors of 4 `int8` each and `acc` is a single `int32` number to accumulate the distance in: ```c++ // Compute vectorized absolute differences independently. const auto diff = static_cast(__vabsdiffs4(x, y)); // Square, reduce, and add to the accumulator. acc = dp4a(diff, diff, acc); ``` Now consider the following case: ```c++ x = 0x80; // -128, 0, 0, 0 y = 0x7f; // 127, 0, 0, 0 ``` The difference between -128 and 127 is 255, represented as `FF` (`__vabsdiffs4` is smart enough not to compute `abs(a-b)` which would result in `01`). However, if we call the signed version of `dp4a`, `FF` is cast from `int8` to `int32` as `FFFFFFFF` (or -1). The square of -1 is 1, which is added to `acc` (instead of 65025). As the output of `__vabsdiffs4` is correct when considered as an unsigned number, and as addition is the same for signed and unsigned in 2's complement (and `acc` is positive anyway), the easiest fix is to use the unsigned version of `dp4a`, which will cast overflowed differences properly to 32 bits. The previous code simply becomes: ```c++ const auto diff = __vabsdiffs4(x, y); acc = dp4a(diff, diff, static_cast(acc)); ``` ----- Additionally, to avoid underflows in the non-vectorized unsigned case, I replaced the subtraction with `__usad` (absolute difference of unsigned numbers). Note that using the subtraction was correct anyway, because the addition/subtraction is the same for unsigned and signed integers, as well as the least significant half of the multiplication (which is the part that is stored), and the square of a number is also the square of its opposite. Consider: ```c++ uint32_t a = 10; uint32_t b = 20; uint32_t c = a - b; // fffffff6, i.e -10 or 4294967286 uint32_t d = c * c; // (ffffffec)00000064, i.e 100 ``` Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. Nolet (https://github.com/cjnolet) - Artem M. Chirkin (https://github.com/achirkin) URL: https://github.com/rapidsai/raft/pull/1122 --- .../raft/spatial/knn/detail/ivf_flat_search.cuh | 10 +++++++--- python/pylibraft/pylibraft/test/test_refine.py | 2 -- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index d2f7d681d7..628b83a23c 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -886,7 +886,7 @@ struct euclidean_dist { const auto diff = __vabsdiffu4(x, y); acc = dp4a(diff, diff, acc); } else { - const auto diff = x - y; + const auto diff = __usad(x, y, 0u); acc += diff * diff; } } @@ -897,8 +897,12 @@ struct euclidean_dist { __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y) { if constexpr (Veclen > 1) { - const auto diff = static_cast(__vabsdiffs4(x, y)); - acc = dp4a(diff, diff, acc); + // Note that we enforce here that the unsigned version of dp4a is used, because the difference + // between two int8 numbers can be greater than 127 and therefore represented as a negative + // number in int8. Casting from int8 to int32 would yield incorrect results, while casting + // from uint8 to uint32 is correct. + const auto diff = __vabsdiffs4(x, y); + acc = dp4a(diff, diff, static_cast(acc)); } else { const auto diff = x - y; acc += diff * diff; diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py index 49e4e71f9a..c7b8624bf1 100644 --- a/python/pylibraft/pylibraft/test/test_refine.py +++ b/python/pylibraft/pylibraft/test/test_refine.py @@ -124,8 +124,6 @@ def run_refine( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("memory_type", ["device", "host"]) def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type): - if memory_type == "device" and dtype == np.int8: - pytest.xfail("Possibly incorrect distance calculation (IVF-Flat)") run_refine( n_rows=2000, n_queries=n_queries, From 74ef8264c640bf9b35f24e9382e0e36aeffcf073 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 10 Jan 2023 14:21:03 +0100 Subject: [PATCH 05/44] Allow host dataset for IVF-PQ (#1114) This PR enables building (or extending) an IVF-PQ index using data in host memory. Authors: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1114 --- python/pylibraft/pylibraft/common/__init__.py | 3 +- .../pylibraft/pylibraft/common/ai_wrapper.py | 89 +++++++++++++++++++ .../pylibraft/pylibraft/common/cai_wrapper.py | 69 +++----------- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 21 +++-- .../pylibraft/pylibraft/test/test_ivf_pq.py | 36 +++++--- 5 files changed, 144 insertions(+), 74 deletions(-) create mode 100644 python/pylibraft/pylibraft/common/ai_wrapper.py diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py index 4f87720030..f8f9b58426 100644 --- a/python/pylibraft/pylibraft/common/__init__.py +++ b/python/pylibraft/pylibraft/common/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. # +from .ai_wrapper import ai_wrapper from .cai_wrapper import cai_wrapper from .cuda import Stream from .device_ndarray import device_ndarray diff --git a/python/pylibraft/pylibraft/common/ai_wrapper.py b/python/pylibraft/pylibraft/common/ai_wrapper.py new file mode 100644 index 0000000000..b6b1f02187 --- /dev/null +++ b/python/pylibraft/pylibraft/common/ai_wrapper.py @@ -0,0 +1,89 @@ +# +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np + +from pylibraft.common import input_validation + + +class ai_wrapper: + """ + Simple wrapper around a array interface object to reduce + boilerplate for extracting common information from the underlying + dictionary. + """ + + def __init__(self, ai_arr): + """ + Constructor accepts an array interface compliant array + + Parameters + ---------- + ai_arr : array interface array + """ + self.ai_ = ai_arr.__array_interface__ + + @property + def dtype(self): + """ + Returns the dtype of the underlying array interface + """ + return np.dtype(self.ai_["typestr"]) + + @property + def shape(self): + """ + Returns the shape of the underlying array interface + """ + return self.ai_["shape"] + + @property + def c_contiguous(self): + """ + Returns whether the underlying array interface has + c-ordered (row-major) layout + """ + return input_validation.is_c_contiguous(self.ai_) + + @property + def f_contiguous(self): + """ + Returns whether the underlying array interface has + f-ordered (column-major) layout + """ + return not input_validation.is_c_contiguous(self.ai_) + + @property + def data(self): + """ + Returns the data pointer of the underlying array interface + """ + return self.ai_["data"][0] + + def validate_shape_dtype(self, expected_dims=None, expected_dtype=None): + """Checks to see if the shape, dtype, and strides match expectations""" + if expected_dims is not None and len(self.shape) != expected_dims: + raise ValueError( + f"unexpected shape {self.shape} - " + f"expected {expected_dims} dimensions" + ) + + if expected_dtype is not None and self.dtype != expected_dtype: + raise ValueError( + f"invalid dtype {self.dtype}: expected " f"{expected_dtype}" + ) + + if not self.c_contiguous: + raise ValueError("input must be c-contiguous") diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index 5851821f57..cf11ea29ce 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import numpy as np +from types import SimpleNamespace -from pylibraft.common import input_validation +from pylibraft.common.ai_wrapper import ai_wrapper -class cai_wrapper: +class cai_wrapper(ai_wrapper): """ Simple wrapper around a CUDA array interface object to reduce boilerplate for extracting common information from the underlying @@ -33,57 +33,14 @@ def __init__(self, cai_arr): ---------- cai_arr : CUDA array interface array """ - self.cai_ = cai_arr.__cuda_array_interface__ + helper = SimpleNamespace( + __array_interface__=cai_arr.__cuda_array_interface__ + ) + super().__init__(helper) - @property - def dtype(self): - """ - Returns the dtype of the underlying CUDA array interface - """ - return np.dtype(self.cai_["typestr"]) - - @property - def shape(self): - """ - Returns the shape of the underlying CUDA array interface - """ - return self.cai_["shape"] - - @property - def c_contiguous(self): - """ - Returns whether the underlying CUDA array interface has - c-ordered (row-major) layout - """ - return input_validation.is_c_contiguous(self.cai_) - - @property - def f_contiguous(self): - """ - Returns whether the underlying CUDA array interface has - f-ordered (column-major) layout - """ - return not input_validation.is_c_contiguous(self.cai_) - - @property - def data(self): - """ - Returns the data pointer of the underlying CUDA array interface - """ - return self.cai_["data"][0] - - def validate_shape_dtype(self, expected_dims=None, expected_dtype=None): - """Checks to see if the shape, dtype, and strides match expectations""" - if expected_dims is not None and len(self.shape) != expected_dims: - raise ValueError( - f"unexpected shape {self.shape} - " - f"expected {expected_dims} dimensions" - ) - - if expected_dtype is not None and self.dtype != expected_dtype: - raise ValueError( - f"invalid dtype {self.dtype}: expected " f"{expected_dtype}" - ) - if not self.c_contiguous: - raise ValueError("input must be c-contiguous") +def wrap_array(array): + try: + return cai_wrapper(array) + except AttributeError: + return ai_wrapper(array) diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index a7137e4d08..002a097d0f 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,10 +36,12 @@ from pylibraft.distance.distance_type cimport DistanceType from pylibraft.common import ( Handle, + ai_wrapper, auto_convert_output, cai_wrapper, device_ndarray, ) +from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible from pylibraft.common.handle cimport handle_t @@ -313,10 +315,13 @@ def build(IndexParams index_params, dataset, handle=None): """ Builds an IVF-PQ index that can be later used for nearest neighbor search. + The input array can be either CUDA array interface compliant matrix or + array interface compliant matrix in host memory. + Parameters ---------- index_params : IndexParams object - dataset : CUDA array interface compliant matrix shape (n_samples, dim) + dataset : array interface compliant matrix shape (n_samples, dim) Supported dtype [float, int8, uint8] {handle_docstring} @@ -359,7 +364,7 @@ def build(IndexParams index_params, dataset, handle=None): >>> # handle needs to be explicitly synchronized >>> handle.sync() """ - dataset_cai = cai_wrapper(dataset) + dataset_cai = wrap_array(dataset) dataset_dt = dataset_cai.dtype _check_input_array(dataset_cai, [np.dtype('float32'), np.dtype('byte'), np.dtype('ubyte')]) @@ -413,14 +418,16 @@ def extend(Index index, new_vectors, new_indices, handle=None): """ Extend an existing index with new vectors. + The input array can be either CUDA array interface compliant matrix or + array interface compliant matrix in host memory. Parameters ---------- index : ivf_pq.Index Trained ivf_pq object. - new_vectors : CUDA array interface compliant matrix shape (n_samples, dim) + new_vectors : array interface compliant matrix shape (n_samples, dim) Supported dtype [float, int8, uint8] - new_indices : CUDA array interface compliant matrix shape (n_samples, dim) + new_indices : array interface compliant matrix shape (n_samples, dim) Supported dtype [uint64] {handle_docstring} @@ -473,7 +480,7 @@ def extend(Index index, new_vectors, new_indices, handle=None): handle = Handle() cdef handle_t* handle_ = handle.getHandle() - vecs_cai = cai_wrapper(new_vectors) + vecs_cai = wrap_array(new_vectors) vecs_dt = vecs_cai.dtype cdef uint64_t n_rows = vecs_cai.shape[0] cdef uint32_t dim = vecs_cai.shape[1] @@ -482,7 +489,7 @@ def extend(Index index, new_vectors, new_indices, handle=None): np.dtype('ubyte')], exp_cols=index.dim) - idx_cai = cai_wrapper(new_indices) + idx_cai = wrap_array(new_indices) _check_input_array(idx_cai, [np.dtype('uint64')], exp_rows=n_rows) if len(idx_cai.shape)!=1: raise ValueError("Indices array is expected to be 1D") diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py index 2c6e0dd14c..35738cd471 100644 --- a/python/pylibraft/pylibraft/test/test_ivf_pq.py +++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -97,6 +97,7 @@ def run_ivf_pq_build_search_test( kmeans_n_iters=20, compare=True, inplace=True, + array_type="device", ): dataset = generate_data((n_rows, n_cols), dtype) if metric == "inner_product": @@ -115,7 +116,10 @@ def run_ivf_pq_build_search_test( add_data_on_build=add_data_on_build, ) - index = ivf_pq.build(build_params, dataset_device) + if array_type == "device": + index = ivf_pq.build(build_params, dataset_device) + else: + index = ivf_pq.build(build_params, dataset) assert index.trained if pq_dim != 0: @@ -125,14 +129,20 @@ def run_ivf_pq_build_search_test( assert index.n_lists == build_params.n_lists if not add_data_on_build: - dataset_1_device = device_ndarray(dataset[: n_rows // 2, :]) - dataset_2_device = device_ndarray(dataset[n_rows // 2 :, :]) + dataset_1 = dataset[: n_rows // 2, :] + dataset_2 = dataset[n_rows // 2 :, :] indices_1 = np.arange(n_rows // 2, dtype=np.uint64) - indices_1_device = device_ndarray(indices_1) indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint64) - indices_2_device = device_ndarray(indices_2) - index = ivf_pq.extend(index, dataset_1_device, indices_1_device) - index = ivf_pq.extend(index, dataset_2_device, indices_2_device) + if array_type == "device": + dataset_1_device = device_ndarray(dataset_1) + dataset_2_device = device_ndarray(dataset_2) + indices_1_device = device_ndarray(indices_1) + indices_2_device = device_ndarray(indices_2) + index = ivf_pq.extend(index, dataset_1_device, indices_1_device) + index = ivf_pq.extend(index, dataset_2_device, indices_2_device) + else: + index = ivf_pq.extend(index, dataset_1, indices_1) + index = ivf_pq.extend(index, dataset_2, indices_2) assert index.size >= n_rows @@ -190,7 +200,10 @@ def run_ivf_pq_build_search_test( @pytest.mark.parametrize("n_queries", [100]) @pytest.mark.parametrize("n_lists", [100]) @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) -def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace): +@pytest.mark.parametrize("array_type", ["host", "device"]) +def test_ivf_pq_dtypes( + n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type +): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only l2_expanded metric here. run_ivf_pq_build_search_test( @@ -202,6 +215,7 @@ def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace): metric="l2_expanded", dtype=dtype, inplace=inplace, + array_type=array_type, ) @@ -337,7 +351,8 @@ def test_ivf_pq_search_params(params): @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) -def test_extend(dtype): +@pytest.mark.parametrize("array_type", ["host", "device"]) +def test_extend(dtype, array_type): run_ivf_pq_build_search_test( n_rows=10000, n_cols=10, @@ -347,6 +362,7 @@ def test_extend(dtype): metric="l2_expanded", dtype=dtype, add_data_on_build=False, + array_type=array_type, ) From de7d361535916876f50f125c5a618b1636dd8327 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 10 Jan 2023 17:38:10 -0500 Subject: [PATCH 06/44] build.sh switch to use `RAPIDS` magic value (#1132) rapids-cmake 23.02 is deprecating the magic value of `ALL` since it doesn't cleanly map to the cmake magic value of `all`. Instead we use `RAPIDS` which better represents the architectures we are building for. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1132 --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 34dcd3a2db..94bc055adb 100755 --- a/build.sh +++ b/build.sh @@ -387,7 +387,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE" echo "Building for the architecture of the GPU in the system..." else - RAFT_CMAKE_CUDA_ARCHITECTURES="ALL" + RAFT_CMAKE_CUDA_ARCHITECTURES="RAPIDS" echo "Building for *ALL* supported GPU architectures..." fi From 2c97abeb1a1b6d03b73f38813420b784feb33e87 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 Jan 2023 18:57:23 -0500 Subject: [PATCH 07/44] Decoupling raft handle from underlying resources (#1111) This implements a design idea a few of us have been kicking around for a little while now to help decouple underlying resources from the raft handle and also allow users to never have to explicitly include headers for resources that are never used (such as cublas, cusolver, cusparse, comms, etc...). This effectively breaks the existing raft::handle_t into separate headers for the various resources it contains, providing functions that can be individually included and invoked on a `raft::resources`. This still allows us to write something like a `raft::device_resources` (and also allows us to maintain API compatibility in the meantime by backing the existing `raft::handle_t` with a `raft::resources`. One of the major goals of this PR is to also enable a handle to be used outside of just cuda resources and to allow for unused resources to not need to be loaded nor compiled at all into user code downstream. Follow-on work after this PR will include: 1. Updating all of RAFT's public functions to accept `raft::resources` and using the individual resource accessors instead of assuming `device_resources` everywhere. 2. Deprecating the `handle_t` in favor of the more explicit `device_resources` Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) - Dante Gama Dessavre (https://github.com/dantegd) - William Hicks (https://github.com/wphicks) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/1111 --- build.sh | 3 +- cpp/include/raft/comms/detail/test.hpp | 2 +- cpp/include/raft/core/comms.hpp | 3 +- cpp/include/raft/core/device_resources.hpp | 241 +++++++++++ cpp/include/raft/core/handle.hpp | 312 +------------- cpp/include/raft/core/resource/comms.hpp | 69 +++ .../raft/core/resource/cublas_handle.hpp | 71 +++ cpp/include/raft/core/resource/cuda_event.hpp | 38 ++ .../raft/core/resource/cuda_stream.hpp | 94 ++++ .../raft/core/resource/cuda_stream_pool.hpp | 171 ++++++++ .../raft/core/resource/cusolver_dn_handle.hpp | 75 ++++ .../raft/core/resource/cusolver_sp_handle.hpp | 74 ++++ .../raft/core/resource/cusparse_handle.hpp | 69 +++ .../resource/detail/stream_sync_event.hpp | 50 +++ cpp/include/raft/core/resource/device_id.hpp | 66 +++ .../raft/core/resource/device_properties.hpp | 68 +++ .../raft/core/resource/resource_types.hpp | 105 +++++ cpp/include/raft/core/resource/sub_comms.hpp | 72 ++++ .../raft/core/resource/thrust_policy.hpp | 64 +++ cpp/include/raft/core/resources.hpp | 128 ++++++ .../spatial/knn/detail/ivf_flat_search.cuh | 2 +- cpp/test/CMakeLists.txt | 34 +- cpp/test/{ => cluster}/cluster_solvers.cu | 9 +- .../cluster_solvers_deprecated.cu | 2 +- cpp/test/cluster/kmeans.cu | 14 +- cpp/test/cluster/linkage.cu | 14 +- cpp/test/core/handle.cpp | 251 +++++++++++ cpp/test/{ => core}/interruptible.cu | 2 +- cpp/test/{common => core}/logger.cpp | 2 +- cpp/test/{ => core}/mdarray.cu | 2 +- cpp/test/{ => core}/mdspan_utils.cu | 2 +- cpp/test/{ => core}/memory_type.cpp | 2 +- cpp/test/{ => core}/nvtx.cpp | 2 +- cpp/test/{common => core}/seive.cu | 2 +- cpp/test/{ => core}/span.cpp | 2 +- cpp/test/{ => core}/span.cu | 2 +- cpp/test/{ => core}/test_span.hpp | 2 +- cpp/test/distance/distance_base.cuh | 4 +- cpp/test/distance/fused_l2_nn.cu | 6 +- cpp/test/handle.cpp | 67 --- cpp/test/{ => linalg}/eigen_solvers.cu | 2 +- cpp/test/matrix/columnSort.cu | 4 +- cpp/test/matrix/linewise_op.cu | 4 +- cpp/test/neighbors/epsilon_neighborhood.cu | 4 +- cpp/test/neighbors/selection.cu | 92 ++-- cpp/test/random/make_blobs.cu | 4 +- cpp/test/random/multi_variable_gaussian.cu | 17 +- cpp/test/{ => sparse}/mst.cu | 4 +- cpp/test/{ => sparse}/spectral_matrix.cu | 2 +- cpp/test/stats/cov.cu | 6 +- cpp/test/stats/regression_metrics.cu | 4 +- cpp/test/stats/silhouette_score.cu | 4 +- cpp/test/stats/trustworthiness.cu | 19 +- cpp/test/{ => util}/cudart_utils.cpp | 2 +- cpp/test/{ => util}/device_atomics.cu | 2 +- cpp/test/{ => util}/integer_utils.cpp | 2 +- cpp/test/{ => util}/pow2_utils.cu | 2 +- docs/source/build.md | 4 +- docs/source/developer_guide.md | 405 +++++++++++++++++- .../pylibraft/pylibraft/test/test_refine.py | 2 +- python/raft-dask/setup.py | 4 +- 61 files changed, 2284 insertions(+), 503 deletions(-) create mode 100644 cpp/include/raft/core/device_resources.hpp create mode 100644 cpp/include/raft/core/resource/comms.hpp create mode 100644 cpp/include/raft/core/resource/cublas_handle.hpp create mode 100644 cpp/include/raft/core/resource/cuda_event.hpp create mode 100644 cpp/include/raft/core/resource/cuda_stream.hpp create mode 100644 cpp/include/raft/core/resource/cuda_stream_pool.hpp create mode 100644 cpp/include/raft/core/resource/cusolver_dn_handle.hpp create mode 100644 cpp/include/raft/core/resource/cusolver_sp_handle.hpp create mode 100644 cpp/include/raft/core/resource/cusparse_handle.hpp create mode 100644 cpp/include/raft/core/resource/detail/stream_sync_event.hpp create mode 100644 cpp/include/raft/core/resource/device_id.hpp create mode 100644 cpp/include/raft/core/resource/device_properties.hpp create mode 100644 cpp/include/raft/core/resource/resource_types.hpp create mode 100644 cpp/include/raft/core/resource/sub_comms.hpp create mode 100644 cpp/include/raft/core/resource/thrust_policy.hpp create mode 100644 cpp/include/raft/core/resources.hpp rename cpp/test/{ => cluster}/cluster_solvers.cu (96%) rename cpp/test/{ => cluster}/cluster_solvers_deprecated.cu (96%) create mode 100644 cpp/test/core/handle.cpp rename cpp/test/{ => core}/interruptible.cu (98%) rename cpp/test/{common => core}/logger.cpp (98%) rename cpp/test/{ => core}/mdarray.cu (99%) rename cpp/test/{ => core}/mdspan_utils.cu (99%) rename cpp/test/{ => core}/memory_type.cpp (96%) rename cpp/test/{ => core}/nvtx.cpp (96%) rename cpp/test/{common => core}/seive.cu (95%) rename cpp/test/{ => core}/span.cpp (99%) rename cpp/test/{ => core}/span.cu (99%) rename cpp/test/{ => core}/test_span.hpp (99%) delete mode 100644 cpp/test/handle.cpp rename cpp/test/{ => linalg}/eigen_solvers.cu (98%) rename cpp/test/{ => sparse}/mst.cu (99%) rename cpp/test/{ => sparse}/spectral_matrix.cu (98%) rename cpp/test/{ => util}/cudart_utils.cpp (98%) rename cpp/test/{ => util}/device_atomics.cu (97%) rename cpp/test/{ => util}/integer_utils.cpp (96%) rename cpp/test/{ => util}/pow2_utils.cu (98%) diff --git a/build.sh b/build.sh index 94bc055adb..b47e1ed862 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # raft build script @@ -153,6 +153,7 @@ function limitTests { # Remove the full LIMIT_TEST_TARGETS argument from list of args so that it passes validArgs function ARGS=${ARGS//--limit-tests=$LIMIT_TEST_TARGETS/} TEST_TARGETS=${LIMIT_TEST_TARGETS} + echo "Limiting tests to $TEST_TARGETS" fi fi } diff --git a/cpp/include/raft/comms/detail/test.hpp b/cpp/include/raft/comms/detail/test.hpp index 6ba4be3886..4f879540b4 100644 --- a/cpp/include/raft/comms/detail/test.hpp +++ b/cpp/include/raft/comms/detail/test.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp index 35ab6680de..463c17f2f6 100644 --- a/cpp/include/raft/core/comms.hpp +++ b/cpp/include/raft/core/comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp new file mode 100644 index 0000000000..faca07e8f4 --- /dev/null +++ b/cpp/include/raft/core/device_resources.hpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_DEVICE_RESOURCES +#define __RAFT_DEVICE_RESOURCES + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Main resource container object that stores all necessary resources + * used for calling necessary device functions, cuda kernels and/or libraries + */ +class device_resources : public resources { + public: + // delete copy/move constructors and assignment operators as + // copying and moving underlying resources is unsafe + device_resources(const device_resources&) = delete; + device_resources& operator=(const device_resources&) = delete; + device_resources(device_resources&&) = delete; + device_resources& operator=(device_resources&&) = delete; + + /** + * @brief Construct a resources instance with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + */ + device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}) + : resources{} + { + resources::add_resource_factory(std::make_shared()); + resources::add_resource_factory( + std::make_shared(stream_view)); + resources::add_resource_factory( + std::make_shared(stream_pool)); + } + + /** Destroys all held-up resources */ + virtual ~device_resources() {} + + int get_device() const { return resource::get_device_id(*this); } + + cublasHandle_t get_cublas_handle() const { return resource::get_cublas_handle(*this); } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + return resource::get_cusolver_dn_handle(*this); + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + return resource::get_cusolver_sp_handle(*this); + } + + cusparseHandle_t get_cusparse_handle() const { return resource::get_cusparse_handle(*this); } + + rmm::exec_policy& get_thrust_policy() const { return resource::get_thrust_policy(*this); } + + /** + * @brief synchronize a stream on the current container + */ + void sync_stream(rmm::cuda_stream_view stream) const { resource::sync_stream(*this, stream); } + + /** + * @brief synchronize main stream on the current container + */ + void sync_stream() const { resource::sync_stream(*this); } + + /** + * @brief returns main stream on the current container + */ + rmm::cuda_stream_view get_stream() const { return resource::get_cuda_stream(*this); } + + /** + * @brief returns whether stream pool was initialized on the current container + */ + + bool is_stream_pool_initialized() const { return resource::is_stream_pool_initialized(*this); } + + /** + * @brief returns stream pool on the current container + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + return resource::get_cuda_stream_pool(*this); + } + + std::size_t get_stream_pool_size() const { return resource::get_stream_pool_size(*this); } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + return resource::get_stream_from_stream_pool(*this); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + return resource::get_stream_from_stream_pool(*this, stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on current container + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return resource::get_next_usable_stream(*this); + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on current container + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return resource::get_next_usable_stream(*this, stream_idx); + } + + /** + * @brief synchronize the stream pool on the current container + */ + void sync_stream_pool() const { return resource::sync_stream_pool(*this); } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + return resource::sync_stream_pool(*this, stream_indices); + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void wait_stream_pool_on_stream() const { return resource::wait_stream_pool_on_stream(*this); } + + void set_comms(std::shared_ptr communicator) + { + resource::set_comms(*this, communicator); + } + + const comms::comms_t& get_comms() const { return resource::get_comms(*this); } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + resource::set_subcomm(*this, key, subcomm); + } + + const comms::comms_t& get_subcomm(std::string key) const + { + return resource::get_subcomm(*this, key); + } + + bool comms_initialized() const { return resource::comms_initialized(*this); } + + const cudaDeviceProp& get_device_properties() const + { + return resource::get_device_properties(*this); + } +}; // class device_resources + +/** + * @brief RAII approach to synchronizing across all streams in the current container + */ +class stream_syncer { + public: + explicit stream_syncer(const device_resources& handle) : handle_(handle) + { + handle_.sync_stream(); + } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const device_resources& handle_; +}; // class stream_syncer + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 08cb812bb7..48c1718eb0 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,44 +14,23 @@ * limitations under the License. */ -#ifndef __RAFT_RT_HANDLE -#define __RAFT_RT_HANDLE - #pragma once -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -///@todo: enable once we have migrated cuml-comms layer too -//#include - -#include - -#include -#include -#include -#include -#include -#include -#include +#include namespace raft { /** - * @brief Main handle object that stores all necessary context used for calling - * necessary cuda kernels and/or libraries + * raft::handle_t is being kept around for backwards + * compatibility and will be removed in a future version. + * + * Extending the `raft::device_resources` instead of `using` to + * minimize needed changes downstream + * (e.g. existing forward declarations, etc...) + * + * Use of `raft::resources` or `raft::device_resources` is preferred. */ -class handle_t { +class handle_t : public raft::device_resources { public: // delete copy/move constructors and assignment operators as // copying and moving underlying resources is unsafe @@ -61,7 +40,7 @@ class handle_t { handle_t& operator=(handle_t&&) = delete; /** - * @brief Construct a handle with a stream view and stream pool + * @brief Construct a resources instance with a stream view and stream pool * * @param[in] stream_view the default stream (which has the default per-thread stream if * unspecified) @@ -69,271 +48,12 @@ class handle_t { */ handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}) - : dev_id_([]() -> int { - int cur_dev = -1; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - stream_view_{stream_view}, - stream_pool_{stream_pool} + : device_resources{stream_view, stream_pool} { - create_resources(); } /** Destroys all held-up resources */ - virtual ~handle_t() { destroy_resources(); } - - int get_device() const { return dev_id_; } - - cublasHandle_t get_cublas_handle() const - { - std::lock_guard _(mutex_); - if (!cublas_initialized_) { - RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_)); - RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_)); - cublas_initialized_ = true; - } - return cublas_handle_; - } - - cusolverDnHandle_t get_cusolver_dn_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_)); - cusolver_dn_initialized_ = true; - } - return cusolver_dn_handle_; - } - - cusolverSpHandle_t get_cusolver_sp_handle() const - { - std::lock_guard _(mutex_); - if (!cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_)); - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_)); - cusolver_sp_initialized_ = true; - } - return cusolver_sp_handle_; - } - - cusparseHandle_t get_cusparse_handle() const - { - std::lock_guard _(mutex_); - if (!cusparse_initialized_) { - RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_)); - RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_)); - cusparse_initialized_ = true; - } - return cusparse_handle_; - } - - rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } - - /** - * @brief synchronize a stream on the handle - */ - void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); } - - /** - * @brief synchronize main stream on the handle - */ - void sync_stream() const { sync_stream(stream_view_); } - - /** - * @brief returns main stream on the handle - */ - rmm::cuda_stream_view get_stream() const { return stream_view_; } - - /** - * @brief returns whether stream pool was initialized on the handle - */ - - bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; } - - /** - * @brief returns stream pool on the handle - */ - const rmm::cuda_stream_pool& get_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return *stream_pool_; - } - - std::size_t get_stream_pool_size() const - { - return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0; - } - - /** - * @brief return stream from pool - */ - rmm::cuda_stream_view get_stream_from_stream_pool() const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(); - } - - /** - * @brief return stream from pool at index - */ - rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - return stream_pool_->get_stream(stream_idx); - } - - /** - * @brief return stream from pool if size > 0, else main stream on handle - */ - rmm::cuda_stream_view get_next_usable_stream() const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_; - } - - /** - * @brief return stream from pool at index if size > 0, else main stream on handle - * - * @param[in] stream_idx the required index of the stream in the stream pool if available - */ - rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const - { - return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_; - } - - /** - * @brief synchronize the stream pool on the handle - */ - void sync_stream_pool() const - { - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - sync_stream(stream_pool_->get_stream(i)); - } - } - - /** - * @brief synchronize subset of stream pool - * - * @param[in] stream_indices the indices of the streams in the stream pool to synchronize - */ - void sync_stream_pool(const std::vector stream_indices) const - { - RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized"); - for (const auto& stream_index : stream_indices) { - sync_stream(stream_pool_->get_stream(stream_index)); - } - } - - /** - * @brief ask stream pool to wait on last event in main stream - */ - void wait_stream_pool_on_stream() const - { - RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_)); - for (std::size_t i = 0; i < get_stream_pool_size(); i++) { - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0)); - } - } - - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); - return *communicator_; - } - - void set_subcomm(std::string key, std::shared_ptr subcomm) - { - subcomms_[key] = subcomm; - } - - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); - - auto subcomm = subcomms_.at(key); - - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); - - return *subcomm; - } - - bool comms_initialized() const { return (nullptr != communicator_.get()); } - - const cudaDeviceProp& get_device_properties() const - { - std::lock_guard _(mutex_); - if (!device_prop_initialized_) { - RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_)); - device_prop_initialized_ = true; - } - return prop_; - } - - private: - std::shared_ptr communicator_; - std::unordered_map> subcomms_; - - const int dev_id_; - mutable cublasHandle_t cublas_handle_; - mutable bool cublas_initialized_{false}; - mutable cusolverDnHandle_t cusolver_dn_handle_; - mutable bool cusolver_dn_initialized_{false}; - mutable cusolverSpHandle_t cusolver_sp_handle_; - mutable bool cusolver_sp_initialized_{false}; - mutable cusparseHandle_t cusparse_handle_; - mutable bool cusparse_initialized_{false}; - std::unique_ptr thrust_policy_{nullptr}; - rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread}; - std::shared_ptr stream_pool_{nullptr}; - cudaEvent_t event_; - mutable cudaDeviceProp prop_; - mutable bool device_prop_initialized_{false}; - mutable std::mutex mutex_; - - void create_resources() - { - thrust_policy_ = std::make_unique(stream_view_); - - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - void destroy_resources() - { - if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); } - if (cusolver_dn_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); - } - if (cusolver_sp_initialized_) { - RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); - } - if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); } - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); - } -}; // class handle_t - -/** - * @brief RAII approach to synchronizing across all streams in the handle - */ -class stream_syncer { - public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); } - ~stream_syncer() - { - handle_.wait_stream_pool_on_stream(); - handle_.sync_stream_pool(); - } - - stream_syncer(const stream_syncer& other) = delete; - stream_syncer& operator=(const stream_syncer& other) = delete; - - private: - const handle_t& handle_; -}; // class stream_syncer - -} // namespace raft + ~handle_t() override {} +}; -#endif \ No newline at end of file +} // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/core/resource/comms.hpp b/cpp/include/raft/core/resource/comms.hpp new file mode 100644 index 0000000000..b7a74b7dd5 --- /dev/null +++ b/cpp/include/raft/core/resource/comms.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::resource { +class comms_resource : public resource { + public: + comms_resource(std::shared_ptr comnumicator) : communicator_(comnumicator) {} + + void* get_resource() override { return &communicator_; } + + ~comms_resource() override {} + + private: + std::shared_ptr communicator_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class comms_resource_factory : public resource_factory { + public: + comms_resource_factory(std::shared_ptr communicator) : communicator_(communicator) + { + } + + resource_type get_resource_type() override { return resource_type::COMMUNICATOR; } + + resource* make_resource() override { return new comms_resource(communicator_); } + + private: + std::shared_ptr communicator_; +}; + +inline bool comms_initialized(resources const& res) +{ + return res.has_resource_factory(resource_type::COMMUNICATOR); +} + +inline comms::comms_t const& get_comms(resources const& res) +{ + RAFT_EXPECTS(comms_initialized(res), "ERROR: Communicator was not initialized\n"); + return *(*res.get_resource>(resource_type::COMMUNICATOR)); +} + +inline void set_comms(resources const& res, std::shared_ptr communicator) +{ + res.add_resource_factory(std::make_shared(communicator)); +} +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cublas_handle.hpp b/cpp/include/raft/core/resource/cublas_handle.hpp new file mode 100644 index 0000000000..cf6f51ee98 --- /dev/null +++ b/cpp/include/raft/core/resource/cublas_handle.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { + +class cublas_resource : public resource { + public: + cublas_resource(rmm::cuda_stream_view stream) + { + RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_res)); + RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_res, stream)); + } + + ~cublas_resource() override { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_res)); } + + void* get_resource() override { return &cublas_res; } + + private: + cublasHandle_t cublas_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cublas_resource_factory : public resource_factory { + public: + cublas_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUBLAS_HANDLE; } + resource* make_resource() override { return new cublas_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * Load a cublasres_t from raft res if it exists, otherwise + * add it and return it. + * @param res + * @return + */ +inline cublasHandle_t get_cublas_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUBLAS_HANDLE)) { + cudaStream_t stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUBLAS_HANDLE); +}; +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cuda_event.hpp b/cpp/include/raft/core/resource/cuda_event.hpp new file mode 100644 index 0000000000..4859d95ee9 --- /dev/null +++ b/cpp/include/raft/core/resource/cuda_event.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace raft::resource { + +class cuda_event_resource : public resource { + public: + cuda_event_resource() + { + RAFT_CUDA_TRY_NO_THROW(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + void* get_resource() override { return &event_; } + + ~cuda_event_resource() override { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); } + + private: + cudaEvent_t event_; +}; +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp new file mode 100644 index 0000000000..2e01ce0123 --- /dev/null +++ b/cpp/include/raft/core/resource/cuda_stream.hpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft::resource { +class cuda_stream_resource : public resource { + public: + cuda_stream_resource(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread) + : stream(stream_view) + { + } + void* get_resource() override { return &stream; } + + ~cuda_stream_resource() override {} + + private: + rmm::cuda_stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +class cuda_stream_resource_factory : public resource_factory { + public: + cuda_stream_resource_factory(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread) + : stream(stream_view) + { + } + resource_type get_resource_type() override { return resource_type::CUDA_STREAM_VIEW; } + resource* make_resource() override { return new cuda_stream_resource(stream); } + + private: + rmm::cuda_stream_view stream; +}; + +/** + * Load a rmm::cuda_stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline rmm::cuda_stream_view get_cuda_stream(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUDA_STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::CUDA_STREAM_VIEW); +}; + +/** + * Load a rmm::cuda_stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_view) +{ + res.add_resource_factory(std::make_shared(stream_view)); +}; + +/** + * @brief synchronize a specific stream + */ +inline void sync_stream(const resources& res, rmm::cuda_stream_view stream) +{ + interruptible::synchronize(stream); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); } +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/cuda_stream_pool.hpp b/cpp/include/raft/core/resource/cuda_stream_pool.hpp new file mode 100644 index 0000000000..452523d3af --- /dev/null +++ b/cpp/include/raft/core/resource/cuda_stream_pool.hpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace raft::resource { + +class cuda_stream_pool_resource : public resource { + public: + cuda_stream_pool_resource(std::shared_ptr stream_pool) + : stream_pool_(stream_pool) + { + } + + ~cuda_stream_pool_resource() override {} + void* get_resource() override { return &stream_pool_; } + + private: + std::shared_ptr stream_pool_{nullptr}; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cuda_stream_pool_resource_factory : public resource_factory { + public: + cuda_stream_pool_resource_factory(std::shared_ptr stream_pool = {nullptr}) + : stream_pool_(stream_pool) + { + } + + resource_type get_resource_type() override { return resource_type::CUDA_STREAM_POOL; } + resource* make_resource() override { return new cuda_stream_pool_resource(stream_pool_); } + + private: + std::shared_ptr stream_pool_{nullptr}; +}; + +inline bool is_stream_pool_initialized(const resources& res) +{ + return *res.get_resource>( + resource_type::CUDA_STREAM_POOL) != nullptr; +} + +/** + * Load a cuda_stream_pool, and create a new one if it doesn't already exist + * @param res raft res object for managing resources + * @return + */ +inline const rmm::cuda_stream_pool& get_cuda_stream_pool(const resources& res) +{ + if (!res.has_resource_factory(resource_type::CUDA_STREAM_POOL)) { + res.add_resource_factory(std::make_shared()); + } + return *( + *res.get_resource>(resource_type::CUDA_STREAM_POOL)); +}; + +/** + * Explicitly set a stream pool on the current res. Note that this will overwrite + * an existing stream pool on the res. + * @param res + * @param stream_pool + */ +inline void set_cuda_stream_pool(const resources& res, + std::shared_ptr stream_pool) +{ + res.add_resource_factory(std::make_shared(stream_pool)); +}; + +inline std::size_t get_stream_pool_size(const resources& res) +{ + return is_stream_pool_initialized(res) ? get_cuda_stream_pool(res).get_pool_size() : 0; +} + +/** + * @brief return stream from pool + */ +inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res) +{ + RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized"); + return get_cuda_stream_pool(res).get_stream(); +} + +/** + * @brief return stream from pool at index + */ +inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res, + std::size_t stream_idx) +{ + RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized"); + return get_cuda_stream_pool(res).get_stream(stream_idx); +} + +/** + * @brief return stream from pool if size > 0, else main stream on res + */ +inline rmm::cuda_stream_view get_next_usable_stream(const resources& res) +{ + return is_stream_pool_initialized(res) ? get_stream_from_stream_pool(res) : get_cuda_stream(res); +} + +/** + * @brief return stream from pool at index if size > 0, else main stream on res + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ +inline rmm::cuda_stream_view get_next_usable_stream(const resources& res, std::size_t stream_idx) +{ + return is_stream_pool_initialized(res) ? get_stream_from_stream_pool(res, stream_idx) + : get_cuda_stream(res); +} + +/** + * @brief synchronize the stream pool on the res + */ +inline void sync_stream_pool(const resources& res) +{ + for (std::size_t i = 0; i < get_stream_pool_size(res); i++) { + sync_stream(res, get_cuda_stream_pool(res).get_stream(i)); + } +} + +/** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ +inline void sync_stream_pool(const resources& res, const std::vector stream_indices) +{ + RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized"); + for (const auto& stream_index : stream_indices) { + sync_stream(res, get_cuda_stream_pool(res).get_stream(stream_index)); + } +} + +/** + * @brief ask stream pool to wait on last event in main stream + */ +inline void wait_stream_pool_on_stream(const resources& res) +{ + cudaEvent_t event = detail::get_cuda_stream_sync_event(res); + RAFT_CUDA_TRY(cudaEventRecord(event, get_cuda_stream(res))); + for (std::size_t i = 0; i < get_stream_pool_size(res); i++) { + RAFT_CUDA_TRY(cudaStreamWaitEvent(get_cuda_stream_pool(res).get_stream(i), event, 0)); + } +} +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cusolver_dn_handle.hpp b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp new file mode 100644 index 0000000000..7ed5634574 --- /dev/null +++ b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cuda_stream.hpp" +#include +#include +#include +#include +#include + +namespace raft::resource { + +/** + * + */ +class cusolver_dn_resource : public resource { + public: + cusolver_dn_resource(rmm::cuda_stream_view stream) + { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_res)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_res, stream)); + } + + void* get_resource() override { return &cusolver_res; } + + ~cusolver_dn_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_res)); } + + private: + cusolverDnHandle_t cusolver_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cusolver_dn_resource_factory : public resource_factory { + public: + cusolver_dn_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSOLVER_DN_HANDLE; } + resource* make_resource() override { return new cusolver_dn_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * Load a cusolverSpres_t from raft res if it exists, otherwise + * add it and return it. + * @param res + * @return + */ +inline cusolverDnHandle_t get_cusolver_dn_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSOLVER_DN_HANDLE)) { + cudaStream_t stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSOLVER_DN_HANDLE); +}; +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cusolver_sp_handle.hpp b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp new file mode 100644 index 0000000000..1822955301 --- /dev/null +++ b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { + +/** + * + */ +class cusolver_sp_resource : public resource { + public: + cusolver_sp_resource(rmm::cuda_stream_view stream) + { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_res)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_res, stream)); + } + + void* get_resource() override { return &cusolver_res; } + + ~cusolver_sp_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_res)); } + + private: + cusolverSpHandle_t cusolver_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cusolver_sp_resource_factory : public resource_factory { + public: + cusolver_sp_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSOLVER_SP_HANDLE; } + resource* make_resource() override { return new cusolver_sp_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * Load a cusolverSpres_t from raft res if it exists, otherwise + * add it and return it. + * @param res + * @return + */ +inline cusolverSpHandle_t get_cusolver_sp_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSOLVER_SP_HANDLE)) { + cudaStream_t stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSOLVER_SP_HANDLE); +}; +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cusparse_handle.hpp b/cpp/include/raft/core/resource/cusparse_handle.hpp new file mode 100644 index 0000000000..133e01f164 --- /dev/null +++ b/cpp/include/raft/core/resource/cusparse_handle.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { +class cusparse_resource : public resource { + public: + cusparse_resource(rmm::cuda_stream_view stream) + { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_res)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_res, stream)); + } + + ~cusparse_resource() { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_res)); } + void* get_resource() override { return &cusparse_res; } + + private: + cusparseHandle_t cusparse_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cusparse_resource_factory : public resource_factory { + public: + cusparse_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSPARSE_HANDLE; } + resource* make_resource() override { return new cusparse_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * Load a cusparseres_t from raft res if it exists, otherwise + * add it and return it. + * @param res + * @return + */ +inline cusparseHandle_t get_cusparse_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSPARSE_HANDLE)) { + rmm::cuda_stream_view stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSPARSE_HANDLE); +}; +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/detail/stream_sync_event.hpp b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp new file mode 100644 index 0000000000..1d02fef20d --- /dev/null +++ b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource::detail { + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the res_t. + */ +class cuda_stream_sync_event_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::CUDA_STREAM_SYNC_EVENT; } + resource* make_resource() override { return new cuda_event_resource(); } +}; + +/** + * Load a cudaEvent from a resources instance (and populate it on the resources instance) + * if needed) for syncing the main cuda stream. + * @param res raft resources instance for managing resources + * @return + */ +inline cudaEvent_t& get_cuda_stream_sync_event(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUDA_STREAM_SYNC_EVENT)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::CUDA_STREAM_SYNC_EVENT); +}; + +} // namespace raft::resource::detail diff --git a/cpp/include/raft/core/resource/device_id.hpp b/cpp/include/raft/core/resource/device_id.hpp new file mode 100644 index 0000000000..76c57166b3 --- /dev/null +++ b/cpp/include/raft/core/resource/device_id.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace raft::resource { + +class device_id_resource : public resource { + public: + device_id_resource() + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY_NO_THROW(cudaGetDevice(&cur_dev)); + return cur_dev; + }()) + { + } + void* get_resource() override { return &dev_id_; } + + ~device_id_resource() override {} + + private: + int dev_id_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class device_id_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::DEVICE_ID; } + resource* make_resource() override { return new device_id_resource(); } +}; + +/** + * Load a device id from a res (and populate it on the res if needed). + * @param res raft res object for managing resources + * @return + */ +inline int get_device_id(resources const& res) +{ + if (!res.has_resource_factory(resource_type::DEVICE_ID)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::DEVICE_ID); +}; +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/device_properties.hpp b/cpp/include/raft/core/resource/device_properties.hpp new file mode 100644 index 0000000000..d6193e7a95 --- /dev/null +++ b/cpp/include/raft/core/resource/device_properties.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { + +class device_properties_resource : public resource { + public: + device_properties_resource(int dev_id) + { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id)); + } + void* get_resource() override { return &prop_; } + + ~device_properties_resource() override {} + + private: + cudaDeviceProp prop_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class device_properties_resource_factory : public resource_factory { + public: + device_properties_resource_factory(int dev_id) : dev_id_(dev_id) {} + resource_type get_resource_type() override { return resource_type::DEVICE_PROPERTIES; } + resource* make_resource() override { return new device_properties_resource(dev_id_); } + + private: + int dev_id_; +}; + +/** + * Load a cudaDeviceProp from a res (and populate it on the res if needed). + * @param res raft res object for managing resources + * @return + */ +inline cudaDeviceProp& get_device_properties(resources const& res) +{ + if (!res.has_resource_factory(resource_type::DEVICE_PROPERTIES)) { + int dev_id = get_device_id(res); + res.add_resource_factory(std::make_shared(dev_id)); + } + return *res.get_resource(resource_type::DEVICE_PROPERTIES); +}; +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp new file mode 100644 index 0000000000..c763066c79 --- /dev/null +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::resource { + +/** + * @brief Resource types can apply to any resource and don't have to be host- or device-specific. + */ +enum resource_type { + // device-specific resource types + CUBLAS_HANDLE = 0, // cublas handle + CUSOLVER_DN_HANDLE, // cusolver dn handle + CUSOLVER_SP_HANDLE, // cusolver sp handle + CUSPARSE_HANDLE, // cusparse handle + CUDA_STREAM_VIEW, // view of a cuda stream + CUDA_STREAM_POOL, // cuda stream pool + CUDA_STREAM_SYNC_EVENT, // cuda event for syncing streams + COMMUNICATOR, // raft communicator + SUB_COMMUNICATOR, // raft sub communicator + DEVICE_PROPERTIES, // cuda device properties + DEVICE_ID, // cuda device id + THRUST_POLICY, // thrust execution policy + + LAST_KEY // reserved for the last key +}; + +/** + * @brief A resource constructs and contains an instance of + * some pre-determined object type and facades that object + * behind a common API. + */ +class resource { + public: + virtual void* get_resource() = 0; + + virtual ~resource() {} +}; + +class empty_resource : public resource { + public: + empty_resource() : resource() {} + + void* get_resource() override { return nullptr; } + + ~empty_resource() override {} +}; + +/** + * @brief A resource factory knows how to construct an instance of + * a specific raft::resource::resource. + */ +class resource_factory { + public: + /** + * @brief Return the resource_type associated with the current factory + * @return resource_type corresponding to the current factory + */ + virtual resource_type get_resource_type() = 0; + + /** + * @brief Construct an instance of the factory's underlying resource. + * @return resource instance + */ + virtual resource* make_resource() = 0; +}; + +/** + * @brief A resource factory knows how to construct an instance of + * a specific raft::resource::resource. + */ +class empty_resource_factory : public resource_factory { + public: + empty_resource_factory() : resource_factory() {} + /** + * @brief Return the resource_type associated with the current factory + * @return resource_type corresponding to the current factory + */ + resource_type get_resource_type() override { return resource_type::LAST_KEY; } + + /** + * @brief Construct an instance of the factory's underlying resource. + * @return resource instance + */ + resource* make_resource() override { return &res; } + + private: + empty_resource res; +}; + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp new file mode 100644 index 0000000000..9c2c67deed --- /dev/null +++ b/cpp/include/raft/core/resource/sub_comms.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::resource { +class sub_comms_resource : public resource { + public: + sub_comms_resource() : communicators_() {} + void* get_resource() override { return &communicators_; } + + ~sub_comms_resource() override {} + + private: + std::unordered_map> communicators_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class sub_comms_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::SUB_COMMUNICATOR; } + resource* make_resource() override { return new sub_comms_resource(); } +}; + +inline const comms::comms_t& get_subcomm(const resources& res, std::string key) +{ + if (!res.has_resource_factory(resource_type::SUB_COMMUNICATOR)) { + res.add_resource_factory(std::make_shared()); + } + + auto sub_comms = + res.get_resource>>( + resource_type::SUB_COMMUNICATOR); + auto sub_comm = sub_comms->at(key); + RAFT_EXPECTS(nullptr != sub_comm.get(), "ERROR: Subcommunicator was not initialized"); + + return *sub_comm; +} + +inline void set_subcomm(resources const& res, + std::string key, + std::shared_ptr subcomm) +{ + if (!res.has_resource_factory(resource_type::SUB_COMMUNICATOR)) { + res.add_resource_factory(std::make_shared()); + } + auto sub_comms = + res.get_resource>>( + resource_type::SUB_COMMUNICATOR); + sub_comms->insert(std::make_pair(key, subcomm)); +} +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/thrust_policy.hpp b/cpp/include/raft/core/resource/thrust_policy.hpp new file mode 100644 index 0000000000..e3e3cf6aef --- /dev/null +++ b/cpp/include/raft/core/resource/thrust_policy.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +namespace raft::resource { +class thrust_policy_resource : public resource { + public: + thrust_policy_resource(rmm::cuda_stream_view stream_view) + : thrust_policy_(std::make_unique(stream_view)) + { + } + void* get_resource() override { return thrust_policy_.get(); } + + ~thrust_policy_resource() override {} + + private: + std::unique_ptr thrust_policy_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class thrust_policy_resource_factory : public resource_factory { + public: + thrust_policy_resource_factory(rmm::cuda_stream_view stream_view) : stream_view_(stream_view) {} + resource_type get_resource_type() override { return resource_type::THRUST_POLICY; } + resource* make_resource() override { return new thrust_policy_resource(stream_view_); } + + private: + rmm::cuda_stream_view stream_view_; +}; + +/** + * Load a thrust policy from a res (and populate it on the res if needed). + * @param res raft res object for managing resources + * @return + */ +inline rmm::exec_policy& get_thrust_policy(resources const& res) +{ + if (!res.has_resource_factory(resource_type::THRUST_POLICY)) { + rmm::cuda_stream_view stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::THRUST_POLICY); +}; +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp new file mode 100644 index 0000000000..797fd5968d --- /dev/null +++ b/cpp/include/raft/core/resources.hpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "resource/resource_types.hpp" +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Resource container which allows lazy-loading and registration + * of resource_factory implementations, which in turn generate resource instances. + * + * This class is intended to be agnostic of the resources it contains and + * does not, itself, differentiate between host and device resources. Downstream + * accessor functions can then register and load resources as needed in order + * to keep its usage somewhat opaque to end-users. + * + * @code{.cpp} + * #include + * #include + * #include + * + * raft::resources res; + * auto stream = raft::resource::get_cuda_stream(res); + * auto cublas_handle = raft::resource::get_cublas_handle(res); + * @endcode + */ +class resources { + public: + template + using pair_res = std::pair>; + + using pair_res_factory = pair_res; + using pair_resource = pair_res; + + resources() + : factories_(resource::resource_type::LAST_KEY), resources_(resource::resource_type::LAST_KEY) + { + for (int i = 0; i < resource::resource_type::LAST_KEY; ++i) { + factories_.at(i) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared()); + resources_.at(i) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared()); + } + } + + resources(const resources&) = delete; + resources& operator=(const resources&) = delete; + resources(resources&&) = delete; + resources& operator=(resources&&) = delete; + + /** + * @brief Returns true if a resource_factory has been registered for the + * given resource_type, false otherwise. + * @param resource_type resource type to check + * @return true if resource_factory is registered for the given resource_type + */ + bool has_resource_factory(resource::resource_type resource_type) const + { + std::lock_guard _(mutex_); + return factories_.at(resource_type).first != resource::resource_type::LAST_KEY; + } + + /** + * @brief Register a resource_factory with the current instance. + * This will overwrite any existing resource factories. + * @param factory resource factory to register on the current instance + */ + void add_resource_factory(std::shared_ptr factory) const + { + std::lock_guard _(mutex_); + resource::resource_type rtype = factory.get()->get_resource_type(); + RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY, + "LAST_KEY is a placeholder and not a valid resource factory type."); + factories_.at(rtype) = std::make_pair(rtype, factory); + } + + /** + * @brief Retrieve a resource for the given resource_type and cast to given pointer type. + * Note that the resources are loaded lazily on-demand and resources which don't yet + * exist on the current instance will be created using the corresponding factory, if + * it exists. + * @tparam res_t pointer type for which retrieved resource will be casted + * @param resource_type resource type to retrieve + * @return the given resource, if it exists. + */ + template + res_t* get_resource(resource::resource_type resource_type) const + { + std::lock_guard _(mutex_); + + if (resources_.at(resource_type).first == resource::resource_type::LAST_KEY) { + RAFT_EXPECTS(factories_.at(resource_type).first != resource::resource_type::LAST_KEY, + "No resource factory has been registered for the given resource %d.", + resource_type); + resource::resource_factory* factory = factories_.at(resource_type).second.get(); + resources_.at(resource_type) = std::make_pair( + resource_type, std::shared_ptr(factory->make_resource())); + } + + resource::resource* res = resources_.at(resource_type).second.get(); + return reinterpret_cast(res->get_resource()); + } + + private: + mutable std::mutex mutex_; + mutable std::vector factories_; + mutable std::vector resources_; +}; +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index 628b83a23c..8ed71864fd 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 5be8401a6f..8ca30a5c82 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -77,25 +77,25 @@ endfunction() if(BUILD_TESTS) ConfigureTest( - NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster_solvers.cu test/cluster/linkage.cu - OPTIONAL DIST NN + NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster/cluster_solvers.cu + test/cluster/linkage.cu OPTIONAL DIST NN ) ConfigureTest( NAME CORE_TEST PATH - test/common/logger.cpp + test/core/logger.cpp test/core/operators_device.cu test/core/operators_host.cpp - test/handle.cpp - test/interruptible.cu - test/nvtx.cpp - test/mdarray.cu - test/mdspan_utils.cu - test/memory_type.cpp - test/span.cpp - test/span.cu + test/core/handle.cpp + test/core/interruptible.cu + test/core/nvtx.cpp + test/core/mdarray.cu + test/core/mdspan_utils.cu + test/core/memory_type.cpp + test/core/span.cpp + test/core/span.cu test/test.cpp ) @@ -179,7 +179,7 @@ if(BUILD_TESTS) test/matrix/reverse.cu test/matrix/slice.cu test/matrix/triangular.cu - test/spectral_matrix.cu + test/sparse/spectral_matrix.cu ) ConfigureTest( @@ -198,8 +198,8 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SOLVERS_TEST PATH test/cluster_solvers_deprecated.cu test/eigen_solvers.cu test/lap/lap.cu - test/mst.cu OPTIONAL DIST + NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu + test/lap/lap.cu test/sparse/mst.cu OPTIONAL DIST ) ConfigureTest( @@ -290,7 +290,7 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME UTILS_TEST PATH test/common/seive.cu test/cudart_utils.cpp test/device_atomics.cu - test/integer_utils.cpp test/pow2_utils.cu + NAME UTILS_TEST PATH test/core/seive.cu test/util/cudart_utils.cpp test/util/device_atomics.cu + test/util/integer_utils.cpp test/util/pow2_utils.cu ) endif() diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster/cluster_solvers.cu similarity index 96% rename from cpp/test/cluster_solvers.cu rename to cpp/test/cluster/cluster_solvers.cu index 26fbfec011..9293c78294 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster/cluster_solvers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,12 +66,7 @@ TEST(Raft, ModularitySolvers) using value_type = double; handle_t h; - ASSERT_EQ(0, - h. - - get_device() - - ); + ASSERT_EQ(0, h.get_device()); index_type neigvs{10}; index_type maxiter{100}; diff --git a/cpp/test/cluster_solvers_deprecated.cu b/cpp/test/cluster/cluster_solvers_deprecated.cu similarity index 96% rename from cpp/test/cluster_solvers_deprecated.cu rename to cpp/test/cluster/cluster_solvers_deprecated.cu index 167a710b34..dbc7722485 100644 --- a/cpp/test/cluster_solvers_deprecated.cu +++ b/cpp/test/cluster/cluster_solvers_deprecated.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/cluster/kmeans.cu b/cpp/test/cluster/kmeans.cu index 9644541a0c..abc4cd6e13 100644 --- a/cpp/test/cluster/kmeans.cu +++ b/cpp/test/cluster/kmeans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,11 +58,10 @@ template class KmeansTest : public ::testing::TestWithParam> { protected: KmeansTest() - : stream(handle.get_stream()), - d_labels(0, stream), - d_labels_ref(0, stream), - d_centroids(0, stream), - d_sample_weight(0, stream) + : d_labels(0, handle.get_stream()), + d_labels_ref(0, handle.get_stream()), + d_centroids(0, handle.get_stream()), + d_sample_weight(0, handle.get_stream()) { } @@ -70,6 +69,7 @@ class KmeansTest : public ::testing::TestWithParam> { { testparams = ::testing::TestWithParam>::GetParam(); + auto stream = handle.get_stream(); int n_samples = testparams.n_row; int n_features = testparams.n_col; params.n_clusters = testparams.n_clusters; @@ -249,6 +249,7 @@ class KmeansTest : public ::testing::TestWithParam> { auto X = raft::make_device_matrix(handle, n_samples, n_features); auto labels = raft::make_device_vector(handle, n_samples); + auto stream = handle.get_stream(); raft::random::make_blobs(X.data_handle(), labels.data_handle(), @@ -323,7 +324,6 @@ class KmeansTest : public ::testing::TestWithParam> { protected: raft::handle_t handle; - cudaStream_t stream; KmeansInputs testparams; rmm::device_uvector d_labels; rmm::device_uvector d_labels_ref; diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu index 53aa5c55e3..a36ad4abea 100644 --- a/cpp/test/cluster/linkage.cu +++ b/cpp/test/cluster/linkage.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -162,15 +162,18 @@ class LinkageTest : public ::testing::TestWithParam> { public: LinkageTest() : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - labels(params.n_row, stream), - labels_ref(params.n_row, stream) + labels(0, handle.get_stream()), + labels_ref(0, handle.get_stream()) { } protected: void basicTest() { + auto stream = handle.get_stream(); + + labels.resize(params.n_row, stream); + labels_ref.resize(params.n_row, stream); rmm::device_uvector data(params.n_row * params.n_col, stream); raft::copy(data.data(), params.data.data(), data.size(), stream); @@ -178,8 +181,6 @@ class LinkageTest : public ::testing::TestWithParam> { rmm::device_uvector out_children(params.n_row * 2, stream); - raft::handle_t handle; - auto data_view = raft::make_device_matrix_view( data.data(), params.n_row, params.n_col); auto dendrogram_view = @@ -205,7 +206,6 @@ class LinkageTest : public ::testing::TestWithParam> { protected: raft::handle_t handle; - cudaStream_t stream; LinkageInputs params; rmm::device_uvector labels, labels_ref; diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp new file mode 100644 index 0000000000..2148742e83 --- /dev/null +++ b/cpp/test/core/handle.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +using namespace comms; +class mock_comms : public comms_iface { + public: + mock_comms(int n) : n_ranks(n) {} + ~mock_comms() {} + + int get_size() const override { return n_ranks; } + + int get_rank() const override { return 0; } + + std::unique_ptr comm_split(int color, int key) const + { + return std::unique_ptr(new mock_comms(0)); + } + + void barrier() const {} + + void get_request_id(request_t* req) const {} + + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const {} + + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const {} + + void waitall(int count, request_t array_of_requests[]) const {} + + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + } + + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const {} + + void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + } + + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + } + + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + } + + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + } + + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + } + + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + } + + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + } + + status_t sync_stream(cudaStream_t stream) const { return status_t::SUCCESS; } + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const {} + + // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const {} + + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + } + + void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + } + + void group_start() const {} + + void group_end() const {} + + private: + int n_ranks; +}; + +TEST(Raft, HandleDefault) +{ + handle_t h; + ASSERT_EQ(0, h.get_device()); + ASSERT_EQ(rmm::cuda_stream_per_thread, h.get_stream()); + ASSERT_NE(nullptr, h.get_cublas_handle()); + ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); + ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); + ASSERT_NE(nullptr, h.get_cusparse_handle()); +} + +TEST(Raft, Handle) +{ + // test stream pool creation + constexpr std::size_t n_streams = 4; + auto stream_pool = std::make_shared(n_streams); + handle_t h(rmm::cuda_stream_default, stream_pool); + ASSERT_EQ(n_streams, h.get_stream_pool_size()); + + // test non default stream handle + cudaStream_t stream; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + rmm::cuda_stream_view stream_view(stream); + handle_t handle(stream_view); + ASSERT_EQ(stream_view, handle.get_stream()); + handle.sync_stream(stream); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); +} + +TEST(Raft, DefaultConstructor) +{ + handle_t handle; + + // Make sure waiting on the default stream pool + // does not fail. + handle.wait_stream_pool_on_stream(); + handle.sync_stream_pool(); + + auto s1 = handle.get_next_usable_stream(); + auto s2 = handle.get_stream(); + auto s3 = handle.get_next_usable_stream(5); + + ASSERT_EQ(s1, s2); + ASSERT_EQ(s2, s3); + ASSERT_EQ(0, handle.get_stream_pool_size()); +} + +TEST(Raft, GetHandleFromPool) +{ + constexpr std::size_t n_streams = 4; + auto stream_pool = std::make_shared(n_streams); + handle_t parent(rmm::cuda_stream_default, stream_pool); + + for (std::size_t i = 0; i < n_streams; i++) { + auto worker_stream = parent.get_stream_from_stream_pool(i); + handle_t child(worker_stream); + ASSERT_EQ(parent.get_stream_from_stream_pool(i), child.get_stream()); + } + + parent.wait_stream_pool_on_stream(); +} + +TEST(Raft, Comms) +{ + handle_t handle; + auto comm1 = std::make_shared(std::unique_ptr(new mock_comms(2))); + handle.set_comms(comm1); + + ASSERT_EQ(handle.get_comms().get_size(), 2); +} + +TEST(Raft, SubComms) +{ + handle_t handle; + auto comm1 = std::make_shared(std::unique_ptr(new mock_comms(1))); + handle.set_subcomm("key1", comm1); + + auto comm2 = std::make_shared(std::unique_ptr(new mock_comms(2))); + handle.set_subcomm("key2", comm2); + + ASSERT_EQ(handle.get_subcomm("key1").get_size(), 1); + ASSERT_EQ(handle.get_subcomm("key2").get_size(), 2); +} + +} // namespace raft diff --git a/cpp/test/interruptible.cu b/cpp/test/core/interruptible.cu similarity index 98% rename from cpp/test/interruptible.cu rename to cpp/test/core/interruptible.cu index 92adfabd55..f54bb6f859 100644 --- a/cpp/test/interruptible.cu +++ b/cpp/test/core/interruptible.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/common/logger.cpp b/cpp/test/core/logger.cpp similarity index 98% rename from cpp/test/common/logger.cpp rename to cpp/test/core/logger.cpp index a8460e45ca..3f29c9f12c 100644 --- a/cpp/test/common/logger.cpp +++ b/cpp/test/core/logger.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/mdarray.cu b/cpp/test/core/mdarray.cu similarity index 99% rename from cpp/test/mdarray.cu rename to cpp/test/core/mdarray.cu index c292feb894..8e455bebfe 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/core/mdarray.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/mdspan_utils.cu b/cpp/test/core/mdspan_utils.cu similarity index 99% rename from cpp/test/mdspan_utils.cu rename to cpp/test/core/mdspan_utils.cu index 7f1efb78bb..6eaecf78b4 100644 --- a/cpp/test/mdspan_utils.cu +++ b/cpp/test/core/mdspan_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/memory_type.cpp b/cpp/test/core/memory_type.cpp similarity index 96% rename from cpp/test/memory_type.cpp rename to cpp/test/core/memory_type.cpp index 57d44ceefe..02aa8caa6c 100644 --- a/cpp/test/memory_type.cpp +++ b/cpp/test/core/memory_type.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/nvtx.cpp b/cpp/test/core/nvtx.cpp similarity index 96% rename from cpp/test/nvtx.cpp rename to cpp/test/core/nvtx.cpp index 635fe55012..e6c29fa3d8 100644 --- a/cpp/test/nvtx.cpp +++ b/cpp/test/core/nvtx.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/common/seive.cu b/cpp/test/core/seive.cu similarity index 95% rename from cpp/test/common/seive.cu rename to cpp/test/core/seive.cu index 54a59d6251..8634abf3be 100644 --- a/cpp/test/common/seive.cu +++ b/cpp/test/core/seive.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/span.cpp b/cpp/test/core/span.cpp similarity index 99% rename from cpp/test/span.cpp rename to cpp/test/core/span.cpp index f8d9345a12..1a21b5ff47 100644 --- a/cpp/test/span.cpp +++ b/cpp/test/core/span.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/span.cu b/cpp/test/core/span.cu similarity index 99% rename from cpp/test/span.cu rename to cpp/test/core/span.cu index e9af9b857f..f16a18332b 100644 --- a/cpp/test/span.cu +++ b/cpp/test/core/span.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/test_span.hpp b/cpp/test/core/test_span.hpp similarity index 99% rename from cpp/test/test_span.hpp rename to cpp/test/core/test_span.hpp index 254c89f91c..27c50e9695 100644 --- a/cpp/test/test_span.hpp +++ b/cpp/test/core/test_span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 067b1b2c0e..cbfd97ebc6 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -519,10 +519,10 @@ class BigMatrixDistanceTest : public ::testing::Test { } protected: + raft::handle_t handle; int m = 48000; int n = 48000; int k = 1; - raft::handle_t handle; rmm::device_uvector x, dist; }; } // end namespace distance diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 252f56607f..e746a2382d 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,6 +158,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { } protected: + raft::handle_t handle; + cudaStream_t stream; Inputs params; rmm::device_uvector x; rmm::device_uvector y; @@ -166,8 +168,6 @@ class FusedL2NNTest : public ::testing::TestWithParam> { rmm::device_uvector> min; rmm::device_uvector> min_ref; rmm::device_uvector workspace; - raft::handle_t handle; - cudaStream_t stream; virtual void generateGoldenResult() { diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp deleted file mode 100644 index 2ebc38d03a..0000000000 --- a/cpp/test/handle.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace raft { - -TEST(Raft, HandleDefault) -{ - handle_t h; - ASSERT_EQ(0, h.get_device()); - ASSERT_EQ(rmm::cuda_stream_per_thread, h.get_stream()); - ASSERT_NE(nullptr, h.get_cublas_handle()); - ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); - ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); - ASSERT_NE(nullptr, h.get_cusparse_handle()); -} - -TEST(Raft, Handle) -{ - // test stream pool creation - constexpr std::size_t n_streams = 4; - auto stream_pool = std::make_shared(n_streams); - handle_t h(rmm::cuda_stream_default, stream_pool); - ASSERT_EQ(n_streams, h.get_stream_pool_size()); - - // test non default stream handle - cudaStream_t stream; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - rmm::cuda_stream_view stream_view(stream); - handle_t handle(stream_view); - ASSERT_EQ(stream_view, handle.get_stream()); - handle.sync_stream(stream); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); -} - -TEST(Raft, GetHandleFromPool) -{ - constexpr std::size_t n_streams = 4; - auto stream_pool = std::make_shared(n_streams); - handle_t parent(rmm::cuda_stream_default, stream_pool); - - for (std::size_t i = 0; i < n_streams; i++) { - auto worker_stream = parent.get_stream_from_stream_pool(i); - handle_t child(worker_stream); - ASSERT_EQ(parent.get_stream_from_stream_pool(i), child.get_stream()); - } -} - -} // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/linalg/eigen_solvers.cu similarity index 98% rename from cpp/test/eigen_solvers.cu rename to cpp/test/linalg/eigen_solvers.cu index 68b431b894..3e7d923e2d 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/linalg/eigen_solvers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/matrix/columnSort.cu b/cpp/test/matrix/columnSort.cu index 000a911efd..00205830c4 100644 --- a/cpp/test/matrix/columnSort.cu +++ b/cpp/test/matrix/columnSort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,10 +116,10 @@ class ColumnSort : public ::testing::TestWithParam> { } protected: + raft::handle_t handle; columnSort params; rmm::device_uvector keyIn, keySorted, keySortGolden; rmm::device_uvector valueOut, goldenValOut; // valueOut are indexes - raft::handle_t handle; }; const std::vector> inputsf1 = {{0.000001f, 503, 2000, false}, diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 9ce1371944..a791cbc0f0 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,8 +43,8 @@ struct LinewiseTestParams { template struct LinewiseTest : public ::testing::TestWithParam { - const LinewiseTestParams params; const raft::handle_t handle; + const LinewiseTestParams params; rmm::cuda_stream_view stream; LinewiseTest() diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/test/neighbors/epsilon_neighborhood.cu index 4f33db489e..36d7cb25ff 100644 --- a/cpp/test/neighbors/epsilon_neighborhood.cu +++ b/cpp/test/neighbors/epsilon_neighborhood.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,13 +72,13 @@ class EpsNeighTest : public ::testing::TestWithParam> { false); } + const raft::handle_t handle; EpsInputs param; cudaStream_t stream = 0; rmm::device_uvector data; rmm::device_uvector adj; rmm::device_uvector labels, vd; IdxT batchSize; - const raft::handle_t handle; }; // class EpsNeighTest const std::vector> inputsfi = { diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu index d793ea46ee..2f95ed1b3a 100644 --- a/cpp/test/neighbors/selection.cu +++ b/cpp/test/neighbors/selection.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,10 +49,10 @@ std::ostream& operator<<(std::ostream& os, const SelectTestSpec& ss) } template -auto gen_simple_ids(int n_inputs, int input_len) -> std::vector +auto gen_simple_ids(int n_inputs, int input_len, const raft::handle_t& handle) -> std::vector { std::vector out(n_inputs * input_len); - auto s = rmm::cuda_stream_default; + auto s = handle.get_stream(); rmm::device_uvector out_d(out.size(), s); iota_fill(out_d.data(), IdxT(n_inputs), IdxT(input_len), s); update_host(out.data(), out_d.data(), out.size(), s); @@ -65,14 +65,16 @@ struct SelectInOutSimple { public: bool not_supported = false; - SelectInOutSimple(const SelectTestSpec& spec, + SelectInOutSimple(std::shared_ptr handle, + const SelectTestSpec& spec, const std::vector& in_dists, const std::vector& out_dists, const std::vector& out_ids) : in_dists_(in_dists), - in_ids_(gen_simple_ids(spec.n_inputs, spec.input_len)), + in_ids_(gen_simple_ids(spec.n_inputs, spec.input_len, *handle.get())), out_dists_(out_dists), - out_ids_(out_ids) + out_ids_(out_ids), + handle_(handle) { } @@ -82,6 +84,7 @@ struct SelectInOutSimple { auto get_out_ids() -> std::vector& { return out_ids_; } private: + std::shared_ptr handle_; std::vector in_dists_; std::vector in_ids_; std::vector out_dists_; @@ -93,14 +96,17 @@ struct SelectInOutComputed { public: bool not_supported = false; - SelectInOutComputed(const SelectTestSpec& spec, + SelectInOutComputed(std::shared_ptr handle, + const SelectTestSpec& spec, knn::SelectKAlgo algo, const std::vector& in_dists, const std::optional>& in_ids = std::nullopt) - : in_dists_(in_dists), - in_ids_(in_ids.value_or(gen_simple_ids(spec.n_inputs, spec.input_len))), + : handle_(handle), + in_dists_(in_dists), + in_ids_(in_ids.value_or(gen_simple_ids(spec.n_inputs, spec.input_len, *handle.get()))), out_dists_(spec.n_inputs * spec.k), out_ids_(spec.n_inputs * spec.k) + { // check if the size is supported by the algorithm switch (algo) { @@ -119,7 +125,7 @@ struct SelectInOutComputed { default: break; } - auto stream = rmm::cuda_stream_default; + auto stream = handle_.get()->get_stream(); rmm::device_uvector in_dists_d(in_dists_.size(), stream); rmm::device_uvector in_ids_d(in_ids_.size(), stream); @@ -156,6 +162,7 @@ struct SelectInOutComputed { auto get_out_ids() -> std::vector& { return out_ids_; } private: + std::shared_ptr handle_; std::vector in_dists_; std::vector in_ids_; std::vector out_dists_; @@ -205,11 +212,12 @@ struct SelectInOutComputed { }; template -using Params = std::tuple; +using Params = std::tuple>; template typename ParamsReader> class SelectionTest : public testing::TestWithParam::ParamsIn> { protected: + std::shared_ptr handle_; const SelectTestSpec spec; const knn::SelectKAlgo algo; @@ -218,10 +226,11 @@ class SelectionTest : public testing::TestWithParam::InOut> ps) - : spec(std::get<0>(ps)), + : handle_(std::get<3>(ps)), + spec(std::get<0>(ps)), algo(std::get<1>(ps)), ref(std::get<2>(ps)), - res(spec, algo, ref.get_in_dists(), ref.get_in_ids()) + res(handle_, spec, algo, ref.get_in_dists(), ref.get_in_ids()) { } @@ -238,12 +247,13 @@ class SelectionTest : public testing::TestWithParam())); + ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare())); // If the dists (keys) are the same, different corresponding ids may end up in the selection due // to non-deterministic nature of some implementations. - auto& in_ids = ref.get_in_ids(); - auto& in_dists = ref.get_in_dists(); + auto& in_ids = ref.get_in_ids(); + auto& in_dists = ref.get_in_dists(); + auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) { if (i == j) return true; auto ix_i = size_t(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin()); @@ -265,17 +275,20 @@ struct params_simple { using InOut = SelectInOutSimple; using Inputs = std::tuple, std::vector, std::vector>; - using ParamsIn = std::tuple; + using Handle = std::shared_ptr; + using ParamsIn = std::tuple; static auto read(ParamsIn ps) -> Params { - auto ins = std::get<0>(ps); - auto algo = std::get<1>(ps); + auto ins = std::get<0>(ps); + auto algo = std::get<1>(ps); + auto handle = std::get<2>(ps); return std::make_tuple( std::get<0>(ins), algo, SelectInOutSimple( - std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins))); + handle, std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins)), + handle); } }; @@ -345,32 +358,36 @@ INSTANTIATE_TEST_CASE_P(SelectionTest, testing::Values(knn::SelectKAlgo::FAISS, knn::SelectKAlgo::RADIX_8_BITS, knn::SelectKAlgo::RADIX_11_BITS, - knn::SelectKAlgo::WARP_SORT))); + knn::SelectKAlgo::WARP_SORT), + testing::Values(std::make_shared()))); template struct with_ref { template struct params_random { using InOut = SelectInOutComputed; - using ParamsIn = std::tuple; + using Handle = std::shared_ptr; + using ParamsIn = std::tuple; static auto read(ParamsIn ps) -> Params { - auto spec = std::get<0>(ps); - auto algo = std::get<1>(ps); + auto spec = std::get<0>(ps); + auto algo = std::get<1>(ps); + auto handle = std::get<2>(ps); + std::vector dists(spec.input_len * spec.n_inputs); - raft::handle_t handle; { - auto s = handle.get_stream(); + auto s = (*handle.get()).get_stream(); rmm::device_uvector dists_d(spec.input_len * spec.n_inputs, s); raft::random::RngState r(42); - normal(handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0)); + normal(*(handle.get()), r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0)); update_host(dists.data(), dists_d.data(), dists_d.size(), s); s.synchronize(); } - return std::make_tuple(spec, algo, SelectInOutComputed(spec, RefAlgo, dists)); + return std::make_tuple( + spec, algo, SelectInOutComputed(handle, spec, RefAlgo, dists), handle); } }; }; @@ -416,11 +433,11 @@ auto inputs_random_largesize = testing::Values(SelectTestSpec{100, 100000, 1, tr SelectTestSpec{100, 100000, 100, true, false}, SelectTestSpec{100, 100000, 200, true}, SelectTestSpec{100000, 100, 100, false}, - SelectTestSpec{1, 1000000000, 1, true}, - SelectTestSpec{1, 1000000000, 16, false, false}, - SelectTestSpec{1, 1000000000, 64, false}, - SelectTestSpec{1, 1000000000, 128, true, false}, - SelectTestSpec{1, 1000000000, 256, false, false}); + SelectTestSpec{1, 100000000, 1, true}, + SelectTestSpec{1, 100000000, 16, false, false}, + SelectTestSpec{1, 100000000, 64, false}, + SelectTestSpec{1, 100000000, 128, true, false}, + SelectTestSpec{1, 100000000, 256, false, false}); auto inputs_random_largek = testing::Values(SelectTestSpec{100, 100000, 1000, true}, SelectTestSpec{100, 100000, 2000, true}, @@ -436,7 +453,8 @@ INSTANTIATE_TEST_CASE_P(SelectionTest, testing::Combine(inputs_random_longlist, testing::Values(knn::SelectKAlgo::RADIX_8_BITS, knn::SelectKAlgo::RADIX_11_BITS, - knn::SelectKAlgo::WARP_SORT))); + knn::SelectKAlgo::WARP_SORT), + testing::Values(std::make_shared()))); typedef SelectionTest::params_random> ReferencedRandomDoubleSizeT; @@ -446,7 +464,8 @@ INSTANTIATE_TEST_CASE_P(SelectionTest, testing::Combine(inputs_random_longlist, testing::Values(knn::SelectKAlgo::RADIX_8_BITS, knn::SelectKAlgo::RADIX_11_BITS, - knn::SelectKAlgo::WARP_SORT))); + knn::SelectKAlgo::WARP_SORT), + testing::Values(std::make_shared()))); typedef SelectionTest::params_random> ReferencedRandomDoubleInt; @@ -454,7 +473,8 @@ TEST_P(ReferencedRandomDoubleInt, LargeSize) { run(); } INSTANTIATE_TEST_CASE_P(SelectionTest, ReferencedRandomDoubleInt, testing::Combine(inputs_random_largesize, - testing::Values(knn::SelectKAlgo::WARP_SORT))); + testing::Values(knn::SelectKAlgo::WARP_SORT), + testing::Values(std::make_shared()))); /** TODO: Fix test failure in RAFT CI * diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu index 741b374c8c..ea7283977c 100644 --- a/cpp/test/random/make_blobs.cu +++ b/cpp/test/random/make_blobs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,8 +147,8 @@ class MakeBlobsTest : public ::testing::TestWithParam> { } protected: - MakeBlobsInputs params; raft::handle_t handle; + MakeBlobsInputs params; cudaStream_t stream = 0; device_vector mean_var; diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu index 04626a53c7..b2b99027d6 100644 --- a/cpp/test/random/multi_variable_gaussian.cu +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,9 +79,10 @@ template template class MVGTest : public ::testing::TestWithParam> { - protected: + public: MVGTest() - : workspace_d(0, handle.get_stream()), + : params(::testing::TestWithParam>::GetParam()), + workspace_d(0, handle.get_stream()), P_d(0, handle.get_stream()), x_d(0, handle.get_stream()), X_d(0, handle.get_stream()), @@ -90,6 +91,7 @@ class MVGTest : public ::testing::TestWithParam> { { } + protected: void SetUp() override { // getting params @@ -195,15 +197,15 @@ class MVGTest : public ::testing::TestWithParam> { } protected: + raft::handle_t handle; MVGInputs params; - std::vector P, x, X; rmm::device_uvector workspace_d, P_d, x_d, X_d, Rand_cov, Rand_mean; + std::vector P, x, X; int dim, nPoints; typename detail::multi_variable_gaussian::Decomposer method; Correlation corr; detail::multi_variable_gaussian* mvg = NULL; T tolerance; - raft::handle_t handle; }; // end of MVGTest class template @@ -220,7 +222,7 @@ class MVGMdspanTest : public ::testing::TestWithParam> { } } - protected: + public: MVGMdspanTest() : workspace_d(0, handle.get_stream()), P_d(0, handle.get_stream()), @@ -323,13 +325,14 @@ class MVGMdspanTest : public ::testing::TestWithParam> { } protected: + raft::handle_t handle; + MVGInputs params; std::vector P, x, X; rmm::device_uvector workspace_d, P_d, x_d, X_d, Rand_cov, Rand_mean; int dim, nPoints; Correlation corr; T tolerance; - raft::handle_t handle; }; // end of MVGTest class ///@todo find out the reason that Un-correlated covs are giving problems (in qr) diff --git a/cpp/test/mst.cu b/cpp/test/sparse/mst.cu similarity index 99% rename from cpp/test/mst.cu rename to cpp/test/sparse/mst.cu index d11f0b5842..7c7d264f3f 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/sparse/mst.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include -#include "test_utils.cuh" +#include "../test_utils.cuh" #include #include #include diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/sparse/spectral_matrix.cu similarity index 98% rename from cpp/test/spectral_matrix.cu rename to cpp/test/sparse/spectral_matrix.cu index 867b1e9daf..02856cb378 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/sparse/spectral_matrix.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 59a2c6e081..287bb85886 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,10 +103,10 @@ class CovTest : public ::testing::TestWithParam> { } protected: - CovInputs params; - rmm::device_uvector data, mean_act, cov_act, cov_cm, cov_cm_ref; cublasHandle_t handle; cudaStream_t stream = 0; + CovInputs params; + rmm::device_uvector data, mean_act, cov_act, cov_cm, cov_cm_ref; }; ///@todo: add stable=false after it has been implemented diff --git a/cpp/test/stats/regression_metrics.cu b/cpp/test/stats/regression_metrics.cu index 86ac03c8b3..b3e0df32f8 100644 --- a/cpp/test/stats/regression_metrics.cu +++ b/cpp/test/stats/regression_metrics.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,8 +106,8 @@ class RegressionTest : public ::testing::TestWithParam> { } protected: - RegressionInputs params; raft::handle_t handle; + RegressionInputs params; cudaStream_t stream = 0; double mean_abs_error = 0; double mean_squared_error = 0; diff --git a/cpp/test/stats/silhouette_score.cu b/cpp/test/stats/silhouette_score.cu index 876926b71a..354a9c29cc 100644 --- a/cpp/test/stats/silhouette_score.cu +++ b/cpp/test/stats/silhouette_score.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -192,6 +192,7 @@ class silhouetteScoreTest : public ::testing::TestWithParam d_X; @@ -203,7 +204,6 @@ class silhouetteScoreTest : public ::testing::TestWithParam d_X(X.size(), stream); - rmm::device_uvector d_X_embedded(X_embedded.size(), stream); + auto stream = handle.get_stream(); + d_X.resize(X.size(), stream); + d_X_embedded.resize(X_embedded.size(), stream); raft::update_device(d_X.data(), X.data(), X.size(), stream); raft::update_device(d_X_embedded.data(), X_embedded.data(), X_embedded.size(), stream); auto n_sample = 50; @@ -338,6 +338,11 @@ class TrustworthinessScoreTest : public ::testing::Test { void TearDown() override {} protected: + raft::handle_t handle; + + rmm::device_uvector d_X; + rmm::device_uvector d_X_embedded; + double score; }; diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/util/cudart_utils.cpp similarity index 98% rename from cpp/test/cudart_utils.cpp rename to cpp/test/util/cudart_utils.cpp index 7e8585c7c7..e6b1aa9676 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/util/cudart_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/device_atomics.cu b/cpp/test/util/device_atomics.cu similarity index 97% rename from cpp/test/device_atomics.cu rename to cpp/test/util/device_atomics.cu index 4e56b8d486..5e8a67c8f6 100644 --- a/cpp/test/device_atomics.cu +++ b/cpp/test/util/device_atomics.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/integer_utils.cpp b/cpp/test/util/integer_utils.cpp similarity index 96% rename from cpp/test/integer_utils.cpp rename to cpp/test/util/integer_utils.cpp index 46fa8d348d..ed5dddf72d 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/util/integer_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/pow2_utils.cu b/cpp/test/util/pow2_utils.cu similarity index 98% rename from cpp/test/pow2_utils.cu rename to cpp/test/util/pow2_utils.cu index 9e9bd80673..e29e4eeb9c 100644 --- a/cpp/test/pow2_utils.cu +++ b/cpp/test/util/pow2_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/docs/source/build.md b/docs/source/build.md index 2eba3af450..c88cf6c162 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -130,7 +130,7 @@ For example, to run the distance tests: It can take sometime to compile all of the tests. You can build individual tests by providing a semicolon-separated list to the `--limit-tests` option in `build.sh`: ```bash -./build.sh libraft tests --limit-tests=NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST +./build.sh libraft tests -n --limit-tests=NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST ``` ### Benchmarks @@ -143,7 +143,7 @@ The benchmarks are broken apart by algorithm category, so you will find several It can take sometime to compile all of the benchmarks. You can build individual benchmarks by providing a semicolon-separated list to the `--limit-bench` option in `build.sh`: ```bash -./build.sh libraft bench --limit-bench=NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH +./build.sh libraft bench -n --limit-bench=NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH ``` ### C++ Using Cmake Directly diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index b37d5dc1af..2f54753cc6 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -1,5 +1,13 @@ # Developer Guide +## General +Please start by reading the [Contributor Guide](contributing.md). + +## Performance +1. In performance critical sections of the code, favor `cudaDeviceGetAttribute` over `cudaDeviceGetProperties`. See corresponding CUDA devblog [here](https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/) to know more. +2. If an algo requires you to launch GPU work in multiple cuda streams, do not create multiple `raft::resources` objects, one for each such work stream. Instead, use the stream pool configured on the given `raft::resources` instance's `raft::resources::get_stream_from_stream_pool()` to pick up the right cuda stream. Refer to the section on [CUDA Resources](#resource-management) and the section on [Threading](#threading-model) for more details. TIP: use `raft::resources::get_stream_pool_size()` to know how many such streams are available at your disposal. + + ## Local Development Developing features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. @@ -8,11 +16,239 @@ The process for working on a CUDA/C++ feature which might span RAFT and one or m If building a feature which spans projects and not using the source build in cmake, the RAFT changes (both C++ and Python) will need to be installed into the environment of the consuming project before they can be used. The ideal integration of RAFT into consuming projects will enable both the source build in the consuming project only for this case but also rely on a more stable packaging (such as conda packaging) otherwise. -## API stability + +## Threading Model + +With the exception of the `raft::resources`, RAFT algorithms should maintain thread-safety and are, in general, +assumed to be single threaded. This means they should be able to be called from multiple host threads so +long as different instances of `raft::resources` are used. + +Exceptions are made for algorithms that can take advantage of multiple CUDA streams within multiple host threads +in order to oversubscribe or increase occupancy on a single GPU. In these cases, the use of multiple host +threads within RAFT algorithms should be used only to maintain concurrency of the underlying CUDA streams. +Multiple host threads should be used sparingly, be bounded, and should steer clear of performing CPU-intensive +computations. + +A good example of an acceptable use of host threads within a RAFT algorithm might look like the following + +```cpp +#include +#include +#include +raft::resources res; + +... + +sync_stream(res); + +... + +int n_streams = get_stream_pool_size(res); + +#pragma omp parallel for num_threads(n_threads) +for(int i = 0; i < n; i++) { + int thread_num = omp_get_thread_num() % n_threads; + cudaStream_t s = get_stream_from_stream_pool(res, thread_num); + ... possible light cpu pre-processing ... + my_kernel1<<>>(...); + ... + ... some possible async d2h / h2d copies ... + my_kernel2<<>>(...); + ... + sync_stream(res, s); + ... possible light cpu post-processing ... +} +``` + +In the example above, if there is no CPU pre-processing at the beginning of the for-loop, an event can be registered in +each of the streams within the for-loop to make them wait on the stream from the handle. If there is no CPU post-processing +at the end of each for-loop iteration, `sync_stream(res, s)` can be replaced with a single `sync_stream_pool(res)` +after the for-loop. + +To avoid compatibility issues between different threading models, the only threading programming allowed in RAFT is OpenMP. +Though RAFT's build enables OpenMP by default, RAFT algorithms should still function properly even when OpenMP has been +disabled. If the CPU pre- and post-processing were not needed in the example above, OpenMP would not be needed. + +The use of threads in third-party libraries is allowed, though they should still avoid depending on a specific OpenMP runtime. + +## Public Interface + +### General guidelines +Functions exposed via the C++ API must be stateless. Things that are OK to be exposed on the interface: +1. Any [POD](https://en.wikipedia.org/wiki/Passive_data_structure) - see [std::is_pod](https://en.cppreference.com/w/cpp/types/is_pod) as a reference for C++11 POD types. +2. `raft::resources` - since it stores resource-related state which has nothing to do with model/algo state. +3. Avoid using pointers to POD types (explicitly putting it out, even though it can be considered as a POD) and pass the structures by reference instead. + Internal to the C++ API, these stateless functions are free to use their own temporary classes, as long as they are not exposed on the interface. +4. Accept single- (`raft::span`) and multi-dimensional views (`raft::mdspan`) and validate their metadata wherever possible. +5. Prefer `std::optional` for any optional arguments (e.g. do not accept `nullptr`) +6. All public APIs should be lightweight wrappers around calls to private APIs inside the `detail` namespace. + +### API stability Since RAFT is a core library with multiple consumers, it's important that the public APIs maintain stability across versions and any changes to them are done with caution, adding new functions and deprecating the old functions over a couple releases as necessary. -The public APIs should be lightweight wrappers around calls to private APIs inside the `detail` namespace. +### Stateless C++ APIs + +Using the IVF-PQ algorithm as an example, the following way of exposing its API would be wrong according to the guidelines in this section, since it exposes a non-POD C++ class object in the C++ API: +```cpp +template +class ivf_pq { + ivf_pq_params params_; + raft::resources const& res_; + +public: + ivf_pq(raft::resources const& res); + void train(raft::device_matrix dataset); + void search(raft::device_matrix queries, + raft::device_matrix out_inds, + raft::device_matrix out_dists); +}; +``` + +An alternative correct way to expose this could be: +```cpp +namespace raft::ivf_pq { + +template +void ivf_pq_train(raft::resources const& res, const raft::ivf_pq_params ¶ms, raft::ivf_pq_index &index, +raft::device_matrix dataset); + +template +void ivf_pq_search(raft::resources const& res, raft::ivf_pq_params const¶ms, raft::ivf_pq_index const & index, +raft::device_matrix queries, +raft::device_matrix out_inds, +raft::device_matrix out_dists); +} +``` + +### Other functions on state + +These guidelines also mean that it is the responsibility of C++ API to expose methods to load and store (aka marshalling) such a data structure. Further continuing the IVF-PQ example, the following methods could achieve this: +```cpp +namespace raft::ivf_pq { + void save(raft::ivf_pq_index const& model, std::ostream &os); + void load(raft::ivf_pq_index& model, std::istream &is); +} +``` + + +## Coding style + +### Code format +#### Introduction +RAFT relies on `clang-format` to enforce code style across all C++ and CUDA source code. The coding style is based on the [Google style guide](https://google.github.io/styleguide/cppguide.html#Formatting). The only digressions from this style are the following. +1. Do not split empty functions/records/namespaces. +2. Two-space indentation everywhere, including the line continuations. +3. Disable reflowing of comments. + The reasons behind these deviations from the Google style guide are given in comments [here](../../cpp/.clang-format). + +#### How is the check done? +All formatting checks are done by this python script: [run-clang-format.py](../../cpp/scripts/run-clang-format.py) which is effectively a wrapper over `clang-format`. An error is raised if the code diverges from the format suggested by clang-format. It is expected that the developers run this script to detect and fix formatting violations before creating PR. + +##### As part of CI +[run-clang-format.py](../../cpp/scripts/run-clang-format.py) is executed as part of our `ci/checks/style.sh` CI test. If there are any formatting violations, PR author is expected to fix those to get CI passing. Steps needed to fix the formatting violations are described in the subsequent sub-section. + +##### Manually +Developers can also manually (or setup this command as part of git pre-commit hook) run this check by executing: +```bash +python ./cpp/scripts/run-clang-format.py +``` +From the root of the RAFT repository. + +#### How to know the formatting violations? +When there are formatting errors, [run-clang-format.py](../../cpp/scripts/run-clang-format.py) prints a `diff` command, showing where there are formatting differences. Unfortunately, unlike `flake8`, `clang-format` does NOT print descriptions of the violations, but instead directly formats the code. So, the only way currently to know about formatting differences is to run the diff command as suggested by this script against each violating source file. + +#### How to fix the formatting violations? +When there are formatting violations, [run-clang-format.py](../../cpp/scripts/run-clang-format.py) prints at the end, the exact command that can be run by developers to fix them. This is the easiest way to fix formatting errors. [This screencast](https://asciinema.org/a/287367) shows how developers can check for formatting violations in their branches and also how to fix those, before sending out PRs. + +In short, to bulk-fix all the formatting violations, execute the following command: +```bash +python ./cpp/scripts/run-clang-format.py -inplace +``` +From the root of the RAFT repository. + +#### clang-format version? +To avoid spurious code style violations we specify the exact clang-format version required, currently `11.1.0`. This is enforced by the [run-clang-format.py](../../cpp/scripts/run-clang-format.py) script itself. Refer [here](../../cpp/README.md#dependencies) for the list of build-time dependencies. + +#### Additional scripts +Along with clang, there are an include checker and copyright checker scripts for checking style, which can be performed as part of CI, as well as manually. + +##### #include style +[include_checker.py](../../cpp/scripts/include_checker.py) is used to enforce the include style as follows: +1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. +2. `#include <...>` should be used for referencing everything else + +Manually, run the following to bulk-fix include style issues: +```bash +python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list of folders which you want to fix] +``` + +##### Copyright header +[copyright.py](../../ci/checks/copyright.py) checks the Copyright header for all git-modified files + +Manually, you can run the following to bulk-fix the header if only the years need to be updated: +```bash +python ./ci/checks/copyright.py --update-current-year +``` +Keep in mind that this only applies to files tracked by git and having been modified. + +## Error handling +Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` and `RAFT_CUSOLVER_TRY`. These macros take care of checking the return values of the used API calls and generate an exception when the command is not successful. If you need to avoid an exception, e.g. inside a destructor, use `RAFT_CUDA_TRY_NO_THROW`, `RAFT_CUBLAS_TRY_NO_THROW ` and `RAFT_CUSOLVER_TRY_NO_THROW`. These macros log the error but do not throw an exception. + +## Logging + +### Introduction +Anything and everything about logging is defined inside [logger.hpp](../../cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. + +### Usage +```cpp +#include + +// Inside your method or function, use any of these macros +RAFT_LOG_TRACE("Hello %s!", "world"); +RAFT_LOG_DEBUG("Hello %s!", "world"); +RAFT_LOG_INFO("Hello %s!", "world"); +RAFT_LOG_WARN("Hello %s!", "world"); +RAFT_LOG_ERROR("Hello %s!", "world"); +RAFT_LOG_CRITICAL("Hello %s!", "world"); +``` + +### Changing logging level +There are 7 logging levels with each successive level becoming quieter: +1. RAFT_LEVEL_TRACE +2. RAFT_LEVEL_DEBUG +3. RAFT_LEVEL_INFO +4. RAFT_LEVEL_WARN +5. RAFT_LEVEL_ERROR +6. RAFT_LEVEL_CRITICAL +7. RAFT_LEVEL_OFF + Pass one of these as per your needs into the `set_level()` method as follows: +```cpp +raft::logger::get.set_level(RAFT_LEVEL_WARN); +// From now onwards, this will print only WARN and above kind of messages +``` + +### Changing logging pattern +Pass the [format string](https://github.com/gabime/spdlog/wiki/3.-Custom-formatting) as follows in order use a different logging pattern than the default. +```cpp +raft::logger::get.set_pattern(YourFavoriteFormat); +``` +One can also use the corresponding `get_pattern()` method to know the current format as well. + +### Temporarily changing the logging pattern +Sometimes, we need to temporarily change the log pattern (eg: for reporting decision tree structure). This can be achieved in a RAII-like approach as follows: +```cpp +{ + PatternSetter _(MyNewTempFormat); + // new log format is in effect from here onwards + doStuff(); + // once the above temporary object goes out-of-scope, the old format will be restored +} +``` + +### Tips +* Do NOT end your logging messages with a newline! It is automatically added by spdlog. +* The `RAFT_LOG_TRACE()` is by default not compiled due to the `RAFT_ACTIVE_LEVEL` macro setup, for performance reasons. If you need it to be enabled, change this macro accordingly during compilation time ## Common Design Considerations @@ -26,9 +262,170 @@ The public APIs should be lightweight wrappers around calls to private APIs insi ## Testing -It's important for RAFT to maintain a high test coverage in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. A well-defined public API can help maintain compile-time stability but means more focus should be placed on testing the functional requirements and verifying execution on the various edge cases within RAFT itself. Ideally, bug fixes and new features should be able to be made to RAFT independently of the consuming projects. +It's important for RAFT to maintain a high test coverage of the public APIs in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. +A well-defined public API can help maintain compile-time stability but means more focus should be placed on testing the functional requirements and verifying execution on the various edge cases within RAFT itself. Ideally, bug fixes and new features should be able to be made to RAFT independently of the consuming projects. ## Documentation -Public APIs always require documentation, since those will be exposed directly to users. In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples. +Public APIs always require documentation since those will be exposed directly to users. For C++, we use [doxygen](http://www.doxygen.nl) and for Python/cython we use [pydoc](https://docs.python.org/3/library/pydoc.html). In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples. + +## Asynchronous operations and stream ordering +All RAFT algorithms should be as asynchronous as possible avoiding the use of the default stream (aka as NULL or `0` stream). Implementations that require only one CUDA Stream should use the stream from `raft::resources`: + +```cpp +#include +#include + +void foo(const raft::resources& res, ...) +{ + cudaStream_t stream = get_cuda_stream(res); +} +``` +When multiple streams are needed, e.g. to manage a pipeline, use the internal streams available in `raft::resources` (see [CUDA Resources](#cuda-resources)). If multiple streams are used all operations still must be ordered according to `raft::resource::get_cuda_stream()` (from `raft/core/resource/cuda_stream.hpp`). Before any operation in any of the internal CUDA streams is started, all previous work in `raft::resource::get_cuda_stream()` must have completed. Any work enqueued in `raft::resource::get_cuda_stream()` after a RAFT function returns should not start before all work enqueued in the internal streams has completed. E.g. if a RAFT algorithm is called like this: +```cpp +#include +#include +void foo(const double* srcdata, double* result) +{ + cudaStream_t stream; + CUDA_RT_CALL( cudaStreamCreate( &stream ) ); + raft::resources res; + set_cuda_stream(res, stream); + + ... + + RAFT_CUDA_TRY( cudaMemcpyAsync( srcdata, h_srcdata.data(), n*sizeof(double), cudaMemcpyHostToDevice, stream ) ); + + raft::algo(raft::resources, dopredict, srcdata, result, ... ); + + RAFT_CUDA_TRY( cudaMemcpyAsync( h_result.data(), result, m*sizeof(int), cudaMemcpyDeviceToHost, stream ) ); + + ... +} +``` +No work in any stream should start in `raft::algo` before the `cudaMemcpyAsync` in `stream` launched before the call to `raft::algo` is done. And all work in all streams used in `raft::algo` should be done before the `cudaMemcpyAsync` in `stream` launched after the call to `raft::algo` starts. + +This can be ensured by introducing interstream dependencies with CUDA events and `cudaStreamWaitEvent`. For convenience, the header `raft/core/device_resources.hpp` provides the class `raft::stream_syncer` which lets all `raft::resources` internal CUDA streams wait on `raft::resource::get_cuda_stream()` in its constructor and in its destructor and lets `raft::resource::get_cuda_stream()` wait on all work enqueued in the `raft::resources` internal CUDA streams. The intended use would be to create a `raft::stream_syncer` object as the first thing in an entry function of the public RAFT API: + +```cpp +namespace raft { + void algo(const raft::resources& res, ...) + { + raft::streamSyncer _(res); + } +} +``` +This ensures the stream ordering behavior described above. + +### Using Thrust +To ensure that thrust algorithms are executed in the intended stream the `thrust::cuda::par` execution policy should be used. To ensure that thrust algorithms allocate temporary memory via the provided device memory allocator, use the `rmm::exec_policy` available in `raft/core/resource/thrust_policy.hpp`, which can be used through `raft::resources`: +```cpp +#include +#include +void foo(const raft::resources& res, ...) +{ + auto execution_policy = get_thrust_policy(res); + thrust::for_each(execution_policy, ... ); +} +``` + +## Resource Management + +Do not create reusable CUDA resources directly in implementations of RAFT algorithms. Instead, use the existing resources in `raft::resources` to avoid constant creation and deletion of reusable resources such as CUDA streams, CUDA events or library handles. Please file a feature request if a resource handle is missing in `raft::resources`. +The resources can be obtained like this +```cpp +#include +#include +#include +void foo(const raft::resources& h, ...) +{ + cublasHandle_t cublasHandle = get_cublas_handle(h); + const int num_streams = get_stream_pool_size(h); + const int stream_idx = ... + cudaStream_t stream = get_stream_from_stream_pool(stream_idx); + ... +} +``` + +The example below shows one way to create `n_stream` number of internal cuda streams with an `rmm::stream_pool` which can later be used by the algos inside RAFT. +```cpp +#include +#include +#include +int main(int argc, char** argv) +{ + int n_streams = argc > 1 ? atoi(argv[1]) : 0; + raft::resources res; + set_cuda_stream_pool(res, std::make_shared(n_streams)); + + foo(res, ...); +} +``` + +## Multi-GPU + +The multi-GPU paradigm of RAFT is **O**ne **P**rocess per **G**PU (OPG). Each algorithm should be implemented in a way that it can run with a single GPU without any specific dependencies to a particular communication library. A multi-GPU implementation should use the methods offered by the class `raft::comms::comms_t` from [raft/core/comms.hpp] for inter-rank/GPU communication. It is the responsibility of the user of cuML to create an initialized instance of `raft::comms::comms_t`. + +E.g. with a CUDA-aware MPI, a RAFT user could use code like this to inject an initialized instance of `raft::comms::mpi_comms` into a `raft::resources`: + +```cpp +#include +#include +#include +#include +... +int main(int argc, char * argv[]) +{ + MPI_Init(&argc, &argv); + int rank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int local_rank = -1; + { + MPI_Comm local_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &local_comm); + + MPI_Comm_rank(local_comm, &local_rank); + + MPI_Comm_free(&local_comm); + } + + cudaSetDevice(local_rank); + + mpi_comms raft_mpi_comms; + MPI_Comm_dup(MPI_COMM_WORLD, &raft_mpi_comms); + + { + raft::device_resources res; + initialize_mpi_comms(res, raft_mpi_comms); + + ... + + raft::algo(res, ... ); + } + + MPI_Comm_free(&raft_mpi_comms); + + MPI_Finalize(); + return 0; +} +``` + +A RAFT developer can assume the following: +* A instance of `raft::comms::comms_t` was correctly initialized. +* All processes that are part of `raft::comms::comms_t` call into the RAFT algorithm cooperatively. + +The initialized instance of `raft::comms::comms_t` can be accessed from the `raft::resources` instance: + +```cpp +#include +#include +void foo(const raft::resources& res, ...) +{ + const raft::comms_t& communicator = get_comms(res); + const int rank = communicator.get_rank(); + const int size = communicator.get_size(); + ... +} +``` diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py index c7b8624bf1..2f3bef2e0c 100644 --- a/python/pylibraft/pylibraft/test/test_refine.py +++ b/python/pylibraft/pylibraft/test/test_refine.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft-dask/setup.py b/python/raft-dask/setup.py index bd21136103..7009a9ab44 100644 --- a/python/raft-dask/setup.py +++ b/python/raft-dask/setup.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ "numpy", "numba>=0.49", "joblib>=0.11", - "dask-cuda>=23.02", + "dask-cuda>=23.2*", "dask>=2022.12.0", f"ucx-py{cuda_suffix}", "distributed>=2022.12.0", From d86610d19a0d368d637a9551a13ddfcb59d1937e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 10 Jan 2023 20:31:52 -0600 Subject: [PATCH 08/44] Fix documentation author (#1134) Fixes docs to mark NVIDIA as the author. Authors: - Bradley Dice (https://github.com/bdice) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1134 --- docs/source/conf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4f78ae2145..4a0dfe00b5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,8 +77,8 @@ # General information about the project. project = "raft" -copyright = "2022, nvidia" -author = "nvidia" +copyright = "2023, NVIDIA Corporation" +author = "NVIDIA Corporation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -161,7 +161,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "raft.tex", "RAFT Documentation", "nvidia", "manual"), + (master_doc, "raft.tex", "RAFT Documentation", "NVIDIA Corporation", "manual"), ] # -- Options for manual page output --------------------------------------- From bbe07554c50a5132009c6b3e66a4ecbf77c81e72 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 12 Jan 2023 05:27:41 +0100 Subject: [PATCH 09/44] Add raft::void_op functor (#1136) Follow up on PR #1049. Adds a void_op functor for lambdas that are unused. Authors: - Allard Hendriksen (https://github.com/ahendriksen) Approvers: - Louis Sugy (https://github.com/Nyrio) - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1136 --- cpp/include/raft/core/operators.hpp | 10 +++++++++- cpp/include/raft/distance/detail/canberra.cuh | 8 ++------ cpp/include/raft/distance/detail/chebyshev.cuh | 9 +++------ cpp/include/raft/distance/detail/l1.cuh | 8 ++------ 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp index de521cc945..398354df46 100644 --- a/cpp/include/raft/core/operators.hpp +++ b/cpp/include/raft/core/operators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,14 @@ struct identity_op { } }; +struct void_op { + template + constexpr RAFT_INLINE_FUNCTION void operator()(UnusedArgs...) const + { + return; + } +}; + template struct cast_op { template diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index 90ed3940e1..43a904edba 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,11 +81,7 @@ static void canberraImpl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = raft::void_op(); if (isRowMajor) { auto canberraRowMajor = pairwiseDistanceMatKernel #include namespace raft { @@ -77,11 +78,7 @@ static void chebyshevImpl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = raft::void_op(); if (isRowMajor) { auto chebyshevRowMajor = pairwiseDistanceMatKernel Date: Thu, 12 Jan 2023 09:49:40 -0800 Subject: [PATCH 10/44] Add L2SqrtExpanded support to ivf_pq (#1138) Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1138 --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 7 ++++++- cpp/test/neighbors/ann_ivf_pq.cuh | 11 ++++++++++- .../ann_ivf_pq/test_float_int64_t.cu | 5 +++-- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 13 +++++++------ .../pylibraft/pylibraft/test/test_ivf_pq.py | 19 ++++++++++--------- 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 1df5671be2..16a78aec1c 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,6 +171,7 @@ void select_clusters(const handle_t& handle, */ float norm_factor; switch (metric) { + case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: norm_factor = 1.0 / -2.0; break; case raft::distance::DistanceType::InnerProduct: norm_factor = 0.0; break; default: RAFT_FAIL("Unsupported distance type %d.", int(metric)); @@ -189,6 +190,7 @@ void select_clusters(const handle_t& handle, float beta; uint32_t gemm_k = dim; switch (metric) { + case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: { alpha = -2.0; beta = 0.0; @@ -710,6 +712,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, if constexpr (PrecompBaseDiff) { // Reduce number of memory reads later by pre-computing parts of the score switch (metric) { + case distance::DistanceType::L2SqrtExpanded: case distance::DistanceType::L2Expanded: { for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { base_diff[i] = query[i] - cluster_center[i]; @@ -743,6 +746,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, float pq_c = *cur_pq_center; cur_pq_center += PqShift; switch (metric) { + case distance::DistanceType::L2SqrtExpanded: case distance::DistanceType::L2Expanded: { float diff; if constexpr (PrecompBaseDiff) { @@ -809,6 +813,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, switch (metric) { // If the metric is non-negative, we can use the query_kth approximation as an early stop // threshold to skip some iterations when computing the score. Add such metrics here. + case distance::DistanceType::L2SqrtExpanded: case distance::DistanceType::L2Expanded: { early_stop_limit = query_kth; } break; diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 353e8b65e5..94777aedd1 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -436,6 +436,15 @@ inline auto enum_variety_ip() -> test_cases_t }); } +inline auto enum_variety_l2sqrt() -> test_cases_t +{ + return map(enum_variety(), [](const ivf_pq_inputs& x) { + ivf_pq_inputs y(x); + y.index_params.metric = distance::DistanceType::L2SqrtExpanded; + return y; + }); +} + /** * Try different number of n_probes, some of which may trigger the non-fused version of the search * kernel. diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu index ecb2faa6a0..db42b1ee6a 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ using f32_f32_i64 = ivf_pq_test; TEST_BUILD_SEARCH(f32_f32_i64) TEST_BUILD_EXTEND_SEARCH(f32_f32_i64) -INSTANTIATE(f32_f32_i64, enum_variety_l2() + enum_variety_ip() + big_dims_small_lut()); +INSTANTIATE(f32_f32_i64, + enum_variety_l2() + enum_variety_ip() + big_dims_small_lut() + enum_variety_l2sqrt()); } // namespace raft::neighbors::ivf_pq diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 002a097d0f..ee30864193 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -64,9 +64,7 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport ( def _get_metric(metric): SUPPORTED_DISTANCES = { "l2_expanded": DistanceType.L2Expanded, - # TODO(tfeher): fix inconsistency: index building for L2SqrtExpanded is - # only supported by build, not by search. - # "euclidean": DistanceType.L2SqrtExpanded + "euclidean": DistanceType.L2SqrtExpanded, "inner_product": DistanceType.InnerProduct } if metric not in SUPPORTED_DISTANCES: @@ -76,7 +74,8 @@ def _get_metric(metric): cdef _get_metric_string(DistanceType metric): return {DistanceType.L2Expanded : "l2_expanded", - DistanceType.InnerProduct: "inner_product"}[metric] + DistanceType.InnerProduct: "inner_product", + DistanceType.L2SqrtExpanded: "euclidean"}[metric] cdef _get_codebook_string(c_ivf_pq.codebook_gen codebook): @@ -135,9 +134,11 @@ cdef class IndexParams: n_list : int, default = 1024 The number of clusters used in the coarse quantizer. metric : string denoting the metric type, default="l2_expanded" - Valid values for metric: ["l2_expanded", "inner_product"], where - - l2_expanded is the equclidean distance without the square root + Valid values for metric: ["l2_expanded", "inner_product", + "euclidean"], where + - l2_expanded is the euclidean distance without the square root operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - euclidean is the euclidean distance - inner product distance is defined as distance(a, b) = \\sum_i a_i * b_i. kmeans_n_iters : int, default = 20 diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py index 35738cd471..db1389c6cd 100644 --- a/python/pylibraft/pylibraft/test/test_ivf_pq.py +++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -59,17 +59,14 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None): X = queries[np.newaxis, i, :] Y = dataset[out_idx[i, :], :] if metric == "l2_expanded": + dist[i, :] = pairwise_distances(X, Y, "sqeuclidean") + elif metric == "euclidean": dist[i, :] = pairwise_distances(X, Y, "euclidean") elif metric == "inner_product": dist[i, :] = np.matmul(X, Y.T) else: raise ValueError("Invalid metric") - # Note: raft l2 metric does not include the square root operation like - # sklearn's euclidean. - if metric == "l2_expanded": - dist = np.power(dist, 2) - dist_eps = abs(dist) dist_eps[dist < 1e-3] = 1e-3 diff = abs(out_dist - dist) / dist_eps @@ -179,9 +176,11 @@ def run_ivf_pq_build_search_test( out_dist = out_dist_device.copy_to_host() # Calculate reference values with sklearn - skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[ - metric - ] + skl_metric = { + "l2_expanded": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] nn_skl = NearestNeighbors( n_neighbors=k, algorithm="brute", metric=skl_metric ) @@ -253,7 +252,9 @@ def test_ivf_pq_n(params): ) -@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"]) +@pytest.mark.parametrize( + "metric", ["l2_expanded", "inner_product", "euclidean"] +) @pytest.mark.parametrize("dtype", [np.float32]) @pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"]) @pytest.mark.parametrize("rotation", [True, False]) From 3f3a59eea8cbd3e069913a954e6faac5eb450be3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 13 Jan 2023 06:08:12 -0500 Subject: [PATCH 11/44] Adding workspace resource (#1137) This will default to `rmm::mr::get_current_device_resource()` in the event no explicit workspace resource has been set. It's using raw pointers right now, but that may be okay as the RMM memory resource API seems to promote that over shared pointers (or any dereferencing at all). Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/1137 --- cpp/include/raft/core/device_resources.hpp | 11 ++- cpp/include/raft/core/handle.hpp | 7 +- .../core/resource/device_memory_resource.hpp | 76 +++++++++++++++++++ .../raft/core/resource/resource_types.hpp | 1 + cpp/test/core/handle.cpp | 20 +++++ 5 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/core/resource/device_memory_resource.hpp diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index faca07e8f4..9b9e07cf4f 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,8 @@ class device_resources : public resources { * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) + std::shared_ptr stream_pool = {nullptr}, + rmm::mr::device_memory_resource* workspace_resource = nullptr) : resources{} { resources::add_resource_factory(std::make_shared()); @@ -81,6 +83,8 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); + resources::add_resource_factory( + std::make_shared(workspace_resource)); } /** Destroys all held-up resources */ @@ -206,6 +210,11 @@ class device_resources : public resources { return resource::get_subcomm(*this, key); } + const rmm::mr::device_memory_resource* get_workspace_resource() const + { + return resource::get_workspace_resource(*this); + } + bool comms_initialized() const { return resource::comms_initialized(*this); } const cudaDeviceProp& get_device_properties() const diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 48c1718eb0..6486965cdf 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -46,9 +46,10 @@ class handle_t : public raft::device_resources { * unspecified) * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}) - : device_resources{stream_view, stream_pool} + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}, + rmm::mr::device_memory_resource* workspace_resource = nullptr) + : device_resources{stream_view, stream_pool, workspace_resource} { } diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp new file mode 100644 index 0000000000..0706f28f94 --- /dev/null +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::resource { +class device_memory_resource : public resource { + public: + device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) + { + if (mr_ == nullptr) { mr = rmm::mr::get_current_device_resource(); } + } + void* get_resource() override { return mr; } + + ~device_memory_resource() override {} + + private: + rmm::mr::device_memory_resource* mr; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +class workspace_resource_factory : public resource_factory { + public: + workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {} + resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; } + resource* make_resource() override { return new device_memory_resource(mr); } + + private: + rmm::mr::device_memory_resource* mr; +}; + +/** + * Load a temp workspace resource from a resources instance (and populate it on the res + * if needed). + * @param res raft resources object for managing resources + * @return device memory resource object + */ +inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res) +{ + if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::WORKSPACE_RESOURCE); +}; + +/** + * Set a temp workspace resource on a resources instance. + * + * @param res raft resources object for managing resources + * @param mr a valid rmm device_memory_resource + * @return + */ +inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr) +{ + res.add_resource_factory(std::make_shared(mr)); +}; +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index c763066c79..ace4b7061b 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -35,6 +35,7 @@ enum resource_type { DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id THRUST_POLICY, // thrust execution policy + WORKSPACE_RESOURCE, // rmm device memory resource LAST_KEY // reserved for the last key }; diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index 2148742e83..75b2d60bcd 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace raft { @@ -248,4 +249,23 @@ TEST(Raft, SubComms) ASSERT_EQ(handle.get_subcomm("key2").get_size(), 2); } +TEST(Raft, WorkspaceResource) +{ + handle_t handle; + + ASSERT_TRUE(dynamic_cast*>( + handle.get_workspace_resource()) == nullptr); + ASSERT_EQ(rmm::mr::get_current_device_resource(), handle.get_workspace_resource()); + + auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); + std::shared_ptr pool = {nullptr}; + handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr); + + ASSERT_TRUE(dynamic_cast*>( + handle2.get_workspace_resource()) != nullptr); + ASSERT_EQ(pool_mr, handle2.get_workspace_resource()); + + delete pool_mr; +} + } // namespace raft From ab4f1fd640138e4b162fe8fd7f85590aedfae7b3 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Fri, 13 Jan 2023 09:26:36 -0800 Subject: [PATCH 12/44] Add L2SqrtExpanded support to ivf_flat ANN indices (#1133) Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1133 --- cpp/include/raft/neighbors/ivf_flat_types.hpp | 19 +++-- .../raft/spatial/knn/detail/ann_quantized.cuh | 18 ++++- .../spatial/knn/detail/ivf_flat_build.cuh | 10 +-- .../spatial/knn/detail/ivf_flat_search.cuh | 78 ++++++++++++------- .../spatial/knn/detail/topk/warpsort_topk.cuh | 12 +-- cpp/test/neighbors/ann_ivf_flat.cu | 4 +- 6 files changed, 91 insertions(+), 50 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp index eea6ae256d..fc5a8116ab 100644 --- a/cpp/include/raft/neighbors/ivf_flat_types.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -252,14 +252,21 @@ struct index : ann::index { * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount * of data. */ - void allocate(const handle_t& handle, IdxT index_size, bool allocate_center_norms) + void allocate(const handle_t& handle, IdxT index_size) { data_ = make_device_mdarray(handle, make_extents(index_size, dim())); indices_ = make_device_mdarray(handle, make_extents(index_size)); - center_norms_ = - allocate_center_norms - ? std::optional(make_device_mdarray(handle, make_extents(n_lists()))) - : std::nullopt; + + switch (metric_) { + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2SqrtExpanded: + case raft::distance::DistanceType::L2Unexpanded: + case raft::distance::DistanceType::L2SqrtUnexpanded: + center_norms_ = make_device_mdarray(handle, make_extents(n_lists())); + break; + default: center_norms_ = std::nullopt; + } + check_consistency(); } diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 10f781d817..975f1a0f89 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -94,6 +94,18 @@ void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, Int index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, params.encodeResidual)); } +inline bool ivf_flat_supported_metric(raft::distance::DistanceType metric) +{ + switch (metric) { + case raft::distance::DistanceType::L2Unexpanded: + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2SqrtExpanded: + case raft::distance::DistanceType::L2SqrtUnexpanded: + case raft::distance::DistanceType::InnerProduct: return true; + default: return false; + } +} + template void approx_knn_build_index(const handle_t& handle, knnIndex* index, @@ -120,9 +132,7 @@ void approx_knn_build_index(const handle_t& handle, } if constexpr (std::is_same_v) { index->metric_processor->preprocess(index_array); } - if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded || - metric == raft::distance::DistanceType::L2Expanded || - metric == raft::distance::DistanceType::InnerProduct)) { + if (ivf_ft_pams && ivf_flat_supported_metric(metric)) { auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg); index->ivf_flat() = std::make_unique>( ivf_flat::build(handle, new_params, index_array, int64_t(n), D)); diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index e951d8fe5d..ed2c6bae49 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -191,8 +191,7 @@ inline auto extend(const handle_t& handle, update_host(&index_size, list_offsets_ptr + n_lists, 1, stream); handle.sync_stream(stream); - ext_index.allocate( - handle, index_size, ext_index.metric() == raft::distance::DistanceType::L2Expanded); + ext_index.allocate(handle, index_size); // Populate index with the old data if (orig_index.size() > 0) { @@ -359,8 +358,7 @@ inline void fill_refinement_index(const handle_t& handle, stream); IdxT index_size = n_roundup * n_lists; - refinement_index->allocate( - handle, index_size, refinement_index->metric() == raft::distance::DistanceType::L2Expanded); + refinement_index->allocate(handle, index_size); RAFT_CUDA_TRY(cudaMemsetAsync(list_sizes_ptr, 0, n_lists * sizeof(uint32_t), stream)); @@ -454,7 +452,7 @@ auto load(const handle_t& handle, const std::string& filename) -> index index index_ = raft::spatial::knn::ivf_flat::index(handle, metric, n_lists, adaptive_centers, dim); - index_.allocate(handle, n_rows, metric == raft::distance::DistanceType::L2Expanded); + index_.allocate(handle, n_rows); auto data = index_.data(); read_mdspan(handle, infile, data); read_mdspan(handle, infile, index_.indices()); diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index 8ed71864fd..fac8519a03 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -663,9 +663,11 @@ template + typename Lambda, + typename PostLambda> __global__ void __launch_bounds__(kThreadsPerBlock) interleaved_scan_kernel(Lambda compute_dist, + PostLambda post_process, const uint32_t query_smem_elems, const T* query, const uint32_t* coarse_index, @@ -777,7 +779,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock) // finalize and store selected neighbours queue.done(); - queue.store(distances, neighbors); + queue.store(distances, neighbors, post_process); } /** @@ -805,8 +807,10 @@ template + typename Lambda, + typename PostLambda> void launch_kernel(Lambda lambda, + PostLambda post_process, const ivf_flat::index& index, const T* queries, const uint32_t* coarse_index, @@ -821,7 +825,7 @@ void launch_kernel(Lambda lambda, RAFT_EXPECTS(Veclen == index.veclen(), "Configured Veclen does not match the index interleaving pattern."); constexpr auto kKernel = - interleaved_scan_kernel; + interleaved_scan_kernel; const int max_query_smem = 16384; int query_smem_elems = std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); @@ -851,6 +855,7 @@ void launch_kernel(Lambda lambda, n_probes, smem_size); kKernel<<>>(lambda, + post_process, query_smem_elems, queries, coarse_index, @@ -941,7 +946,18 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg T, AccT, IdxT, - euclidean_dist>({}, std::forward(args)...); + euclidean_dist, + raft::identity_op>({}, {}, std::forward(args)...); + case raft::distance::DistanceType::L2SqrtExpanded: + case raft::distance::DistanceType::L2SqrtUnexpanded: + return launch_kernel, + raft::sqrt_op>({}, {}, std::forward(args)...); case raft::distance::DistanceType::InnerProduct: return launch_kernel>({}, std::forward(args)...); + inner_prod_dist, + raft::identity_op>({}, {}, std::forward(args)...); // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); } @@ -1105,28 +1122,33 @@ void search_impl(const handle_t& handle, float beta = 0.0f; // todo(lsugy): raft distance? (if performance is similar/better than gemm) - if (index.metric() == raft::distance::DistanceType::L2Expanded) { - alpha = -2.0f; - beta = 1.0f; - raft::linalg::rowNorm(query_norm_dev.data(), - converted_queries_ptr, - static_cast(index.dim()), - static_cast(n_queries), - raft::linalg::L2Norm, - true, - stream, - raft::sqrt_op()); - utils::outer_add(query_norm_dev.data(), - (IdxT)n_queries, - index.center_norms()->data_handle(), - (IdxT)index.n_lists(), - distance_buffer_dev.data(), - stream); - RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min(20, index.dim())); - RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); - } else { - alpha = 1.0f; - beta = 0.0f; + switch (index.metric()) { + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2SqrtExpanded: { + alpha = -2.0f; + beta = 1.0f; + raft::linalg::rowNorm(query_norm_dev.data(), + converted_queries_ptr, + static_cast(index.dim()), + static_cast(n_queries), + raft::linalg::L2Norm, + true, + stream, + raft::sqrt_op()); + utils::outer_add(query_norm_dev.data(), + (IdxT)n_queries, + index.center_norms()->data_handle(), + (IdxT)index.n_lists(), + distance_buffer_dev.data(), + stream); + RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min(20, index.dim())); + RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); + break; + } + default: { + alpha = 1.0f; + beta = 0.0f; + } } linalg::gemm(handle, diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index cbe9f36e97..c06aa04aea 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -212,12 +212,13 @@ class warp_sort { * device pointer to a contiguous array, unique per-subwarp of size `kWarpWidth` * (length: k <= kWarpWidth * kMaxArrLen). */ - __device__ void store(T* out, IdxT* out_idx) const + template + __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const { int idx = Pow2::mod(laneId()); #pragma unroll kMaxArrLen for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) { - out[idx] = val_arr_[i]; + out[idx] = post_process(val_arr_[i]); out_idx[idx] = idx_arr_[i]; } } @@ -591,9 +592,10 @@ class block_sort { } /** Save the content by the pointer location. */ - __device__ void store(T* out, IdxT* out_idx) const + template + __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const { - if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx); } + if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, post_process); } } private: diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu index 3285bc3496..86a62bb487 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cu +++ b/cpp/test/neighbors/ann_ivf_flat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -294,6 +294,8 @@ const std::vector> inputs = { {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false}, + {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true}, // test dims that do not fit into kernel shared memory limits {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, From 7e5ce788a17b789f75893efa3fbc76be8de2c915 Mon Sep 17 00:00:00 2001 From: Sevag H Date: Fri, 13 Jan 2023 15:33:07 -0500 Subject: [PATCH 13/44] Build wheels alongside conda CI (#1116) This PR adds pip wheel CI to the Conda CI, instead of having them work separately. Authors: - Sevag H (https://github.com/sevagh) - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/raft/pull/1116 --- .github/workflows/build.yaml | 43 +++++++++++++++ .github/workflows/pr.yaml | 46 ++++++++++++++++ .github/workflows/test.yaml | 24 +++++++++ .github/workflows/wheels.yml | 72 ------------------------- ci/release/update-version.sh | 5 +- ci/wheel_smoke_test_pylibraft.py | 38 +++++++++++++ ci/wheel_smoke_test_raft_dask.py | 92 ++++++++++++++++++++++++++++++++ python/raft-dask/CMakeLists.txt | 4 +- 8 files changed, 250 insertions(+), 74 deletions(-) delete mode 100644 .github/workflows/wheels.yml create mode 100644 ci/wheel_smoke_test_pylibraft.py create mode 100644 ci/wheel_smoke_test_raft_dask.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 620a13fe17..f40a8c35cf 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -52,3 +52,46 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + wheel-build-pylibraft: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: pylibraft + package-dir: python/pylibraft + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-publish-pylibraft: + needs: wheel-build-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: pylibraft + wheel-build-raft-dask: + needs: wheel-publish-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: raft_dask + package-dir: python/raft-dask + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-publish-raft-dask: + needs: wheel-build-raft-dask + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: raft_dask diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ca2e2356c0..a7cd1a8480 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -17,6 +17,10 @@ jobs: - conda-cpp-tests - conda-python-build - conda-python-tests + - wheel-build-pylibraft + - wheel-tests-pylibraft + - wheel-build-raft-dask + - wheel-tests-raft-dask secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@main checks: @@ -47,3 +51,45 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: pull-request + wheel-build-pylibraft: + needs: checks + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@main + with: + build_type: pull-request + package-name: pylibraft + package-dir: python/pylibraft + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-tests-pylibraft: + needs: wheel-build-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@main + with: + build_type: pull-request + package-name: pylibraft + test-before-amd64: "pip install cupy-cuda11x" + test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64" + test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + test-smoketest: "python ./ci/wheel_smoke_test_pylibraft.py" + wheel-build-raft-dask: + needs: wheel-tests-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@main + with: + build_type: pull-request + package-name: raft_dask + package-dir: python/raft-dask + before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep" + cibw-before-build: "pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl" + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-tests-raft-dask: + needs: wheel-build-raft-dask + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@main + with: + build_type: pull-request + package-name: raft_dask + test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02" + test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02" + test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" + test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index bd201e987f..8b94330f86 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -30,3 +30,27 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + wheel-tests-pylibraft: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@main + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + package-name: pylibraft + test-before-amd64: "pip install cupy-cuda11x" + test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64" + test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + wheel-tests-raft-dask: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@main + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + package-name: raft_dask + test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02" + test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02" + test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml deleted file mode 100644 index 0a681b864b..0000000000 --- a/.github/workflows/wheels.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: RAFT wheels - -on: - workflow_call: - inputs: - versioneer-override: - type: string - default: '' - build-tag: - type: string - default: '' - branch: - required: true - type: string - date: - required: true - type: string - sha: - required: true - type: string - build-type: - type: string - default: nightly - -concurrency: - group: "raft-${{ github.workflow }}-${{ github.ref }}" - cancel-in-progress: true - -jobs: - pylibraft-wheel: - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main - with: - repo: rapidsai/raft - - build-type: ${{ inputs.build-type }} - branch: ${{ inputs.branch }} - sha: ${{ inputs.sha }} - date: ${{ inputs.date }} - - package-dir: python/pylibraft - package-name: pylibraft - - python-package-versioneer-override: ${{ inputs.versioneer-override }} - python-package-build-tag: ${{ inputs.build-tag }} - - skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" - - test-extras: test - test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" - secrets: inherit - raft-dask-wheel: - needs: pylibraft-wheel - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main - with: - repo: rapidsai/raft - - build-type: ${{ inputs.build-type }} - branch: ${{ inputs.branch }} - sha: ${{ inputs.sha }} - date: ${{ inputs.date }} - - package-dir: python/raft-dask - package-name: raft_dask - - python-package-versioneer-override: ${{ inputs.versioneer-override }} - python-package-build-tag: ${{ inputs.build-tag }} - - skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" - - test-extras: test - test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" - secrets: inherit diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 0b6410f9c9..d1f849194e 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. ######################## # RAFT Version Updater # ######################## @@ -53,3 +53,6 @@ done sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/cpu/build.sh sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml + +# Wheel builds install dask-cuda from source, update its branch +sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml diff --git a/ci/wheel_smoke_test_pylibraft.py b/ci/wheel_smoke_test_pylibraft.py new file mode 100644 index 0000000000..7fee674691 --- /dev/null +++ b/ci/wheel_smoke_test_pylibraft.py @@ -0,0 +1,38 @@ +import numpy as np +from scipy.spatial.distance import cdist + +from pylibraft.common import Handle, Stream, device_ndarray +from pylibraft.distance import pairwise_distance + + +if __name__ == "__main__": + metric = "euclidean" + n_rows = 1337 + n_cols = 1337 + + input1 = np.random.random_sample((n_rows, n_cols)) + input1 = np.asarray(input1, order="C").astype(np.float64) + + output = np.zeros((n_rows, n_rows), dtype=np.float64) + + expected = cdist(input1, input1, metric) + + expected[expected <= 1e-5] = 0.0 + + input1_device = device_ndarray(input1) + output_device = None + + s2 = Stream() + handle = Handle(stream=s2) + ret_output = pairwise_distance( + input1_device, input1_device, output_device, metric, handle=handle + ) + handle.sync() + + output_device = ret_output + + actual = output_device.copy_to_host() + + actual[actual <= 1e-5] = 0.0 + + assert np.allclose(expected, actual, rtol=1e-4) diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py new file mode 100644 index 0000000000..32c13e61ca --- /dev/null +++ b/ci/wheel_smoke_test_raft_dask.py @@ -0,0 +1,92 @@ +from dask.distributed import Client, wait +from dask_cuda import LocalCUDACluster, initialize + +from raft_dask.common import ( + Comms, + local_handle, + perform_test_comm_split, + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, +) + +import os +os.environ["UCX_LOG_LEVEL"] = "error" + + +def func_test_send_recv(sessionId, n_trials): + handle = local_handle(sessionId) + return perform_test_comms_send_recv(handle, n_trials) + + +def func_test_collective(func, sessionId, root): + handle = local_handle(sessionId) + return func(handle, root) + + +if __name__ == "__main__": + # initial setup + cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + client = Client(cluster) + + n_trials = 5 + root_location = "client" + + # p2p test for ucx + cb = Comms(comms_p2p=True, verbose=True) + cb.init() + + dfs = [ + client.submit( + func_test_send_recv, + cb.sessionId, + n_trials, + pure=False, + workers=[w], + ) + for w in cb.worker_addresses + ] + + wait(dfs, timeout=5) + + assert list(map(lambda x: x.result(), dfs)) + + cb.destroy() + + # collectives test for nccl + + cb = Comms( + verbose=True, client=client, nccl_root_location=root_location + ) + cb.init() + + for k, v in cb.worker_info(cb.worker_addresses).items(): + + dfs = [ + client.submit( + func_test_collective, + perform_test_comms_allgather, + cb.sessionId, + v["rank"], + pure=False, + workers=[w], + ) + for w in cb.worker_addresses + ] + wait(dfs, timeout=5) + + assert all([x.result() for x in dfs]) + + cb.destroy() + + # final client and cluster teardown + client.close() + cluster.close() diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index fc93a2ddc2..742cd522c3 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -69,6 +69,8 @@ if(NOT raft_FOUND) endif() add_subdirectory(../../cpp raft-cpp ${_exclude_from_all}) + list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules) + find_package(NCCL REQUIRED) endif() include(rapids-cython) From dde7c53920f16d5da33cde539f617cd80ceb0539 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Sat, 14 Jan 2023 00:17:50 +0100 Subject: [PATCH 14/44] IVF-Flat bug fix: the *squared* norm is required for expanded distance calculations (#1141) Authors: - Louis Sugy (https://github.com/Nyrio) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1141 --- cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh | 3 +-- cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index ed2c6bae49..0abd3825e6 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -245,8 +245,7 @@ inline auto extend(const handle_t& handle, n_lists, raft::linalg::L2Norm, true, - stream, - raft::sqrt_op()); + stream); RAFT_LOG_TRACE_VEC(ext_index.center_norms()->data_handle(), std::min(dim, 20)); } } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index fa7504866d..ee020606c7 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -1294,8 +1294,7 @@ auto build( index.n_lists(), raft::linalg::L2Norm, true, - stream, - raft::sqrt_op()); + stream); RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle() + index.dim(), sizeof(float) * index.dim_ext(), center_norms.data(), From 2af2749315c2fc558716a1f829df650990497e90 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Sat, 14 Jan 2023 05:11:46 +0100 Subject: [PATCH 15/44] IVF-PQ: tighten the test criteria (#1135) Make the recall reporting a bit more verbose and try to tighten the `min_recall` for various test cases. This should help spot any regressions in future and improve our understanding of ivf-pq performance for various inputs. Authors: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1135 --- cpp/test/neighbors/ann_ivf_pq.cuh | 122 +++++++++++++++++++++++------- cpp/test/neighbors/ann_utils.cuh | 11 ++- 2 files changed, 104 insertions(+), 29 deletions(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 94777aedd1..b5671b74b0 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -42,15 +42,18 @@ #include #include #include +#include #include namespace raft::neighbors::ivf_pq { struct ivf_pq_inputs { - uint32_t num_db_vecs = 4096; - uint32_t num_queries = 1024; - uint32_t dim = 64; - uint32_t k = 32; + uint32_t num_db_vecs = 4096; + uint32_t num_queries = 1024; + uint32_t dim = 64; + uint32_t k = 32; + std::optional min_recall = std::nullopt; + ivf_pq::index_params index_params; ivf_pq::search_params search_params; @@ -91,6 +94,7 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream PRINT_DIFF(.num_queries); PRINT_DIFF(.dim); PRINT_DIFF(.k); + PRINT_DIFF_V(.min_recall, p.min_recall.value_or(0)); PRINT_DIFF_V(.index_params.metric, print_metric{p.index_params.metric}); PRINT_DIFF(.index_params.metric_arg); PRINT_DIFF(.index_params.add_data_on_build); @@ -100,6 +104,7 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream PRINT_DIFF(.index_params.pq_bits); PRINT_DIFF(.index_params.pq_dim); PRINT_DIFF(.index_params.codebook_kind); + PRINT_DIFF(.index_params.force_random_rotation); PRINT_DIFF(.search_params.n_probes); PRINT_DIFF_V(.search_params.lut_dtype, print_dtype{p.search_params.lut_dtype}); PRINT_DIFF_V(.search_params.internal_distance_dtype, @@ -231,12 +236,16 @@ class ivf_pq_test : public ::testing::TestWithParam { update_host(indices_ivf_pq.data(), indices_ivf_pq_dev.data(), queries_size, stream_); handle_.sync_stream(stream_); - // Using very dense, small codebooks results in large errors in the distance calculation - double low_precision_factor = - static_cast(index.pq_dim() * index.pq_bits()) / static_cast(ps.dim * 8); // A very conservative lower bound on recall - double min_recall = low_precision_factor * static_cast(ps.search_params.n_probes) / - static_cast(ps.index_params.n_lists); + double min_recall = + static_cast(ps.search_params.n_probes) / static_cast(ps.index_params.n_lists); + double low_precision_factor = + static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); + // Using a heuristic to lower the required recall due to code-packing errors + min_recall = + std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall); + // Use explicit per-test min recall value if provided. + min_recall = ps.min_recall.value_or(min_recall); ASSERT_TRUE(eval_neighbours(indices_ref, indices_ivf_pq, @@ -244,8 +253,9 @@ class ivf_pq_test : public ::testing::TestWithParam { distances_ivf_pq, ps.num_queries, ps.k, - 0.001 / low_precision_factor, - min_recall)); + 0.0001 * low_precision_factor, + min_recall)) + << ps; // Test a few extra invariants IdxT min_results = min_output_size(handle_, index, ps.search_params.n_probes); @@ -350,9 +360,16 @@ inline auto small_dims_per_cluster() -> test_cases_t inline auto big_dims() -> test_cases_t { - return with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144}); - // return with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144, 8192, 12288, - // 16384}); + // with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144, 8192, 12288, 16384}); + auto xs = with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144}); + return map(xs, [](const ivf_pq_inputs& x) { + ivf_pq_inputs y(x); + uint32_t pq_len = 2; + y.index_params.pq_dim = div_rounding_up_safe(x.dim, pq_len); + // This comes from pure experimentation, also the recall depens a lot on pq_len. + y.min_recall = 0.48 + 0.028 * std::log2(x.dim); + return y; + }); } /** These will surely trigger no-smem-lut kernel. */ @@ -360,8 +377,11 @@ inline auto big_dims_moderate_lut() -> test_cases_t { return map(big_dims(), [](const ivf_pq_inputs& x) { ivf_pq_inputs y(x); + uint32_t pq_len = 2; + y.index_params.pq_dim = round_up_safe(div_rounding_up_safe(x.dim, pq_len), 4u); y.index_params.pq_bits = 6; y.search_params.lut_dtype = CUDA_R_16F; + y.min_recall = 0.69; return y; }); } @@ -371,9 +391,11 @@ inline auto big_dims_small_lut() -> test_cases_t { return map(big_dims(), [](const ivf_pq_inputs& x) { ivf_pq_inputs y(x); - y.index_params.pq_dim = raft::round_up_safe(y.dim / 8u, 64u); + uint32_t pq_len = 8; + y.index_params.pq_dim = round_up_safe(div_rounding_up_safe(x.dim, pq_len), 4u); y.index_params.pq_bits = 6; y.search_params.lut_dtype = CUDA_R_8U; + y.min_recall = 0.21; return y; }); } @@ -390,30 +412,68 @@ inline auto enum_variety() -> test_cases_t ([](ivf_pq_inputs & x) f)(xs[xs.size() - 1]); \ } while (0); - ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER; }); - ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; }); + ADD_CASE({ + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER; + x.min_recall = 0.86; + }); + ADD_CASE({ + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.min_recall = 0.86; + }); ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER; x.index_params.pq_bits = 4; + x.min_recall = 0.79; }); ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER; x.index_params.pq_bits = 5; + x.min_recall = 0.83; }); - ADD_CASE({ x.index_params.pq_bits = 6; }); - ADD_CASE({ x.index_params.pq_bits = 7; }); - ADD_CASE({ x.index_params.pq_bits = 8; }); + ADD_CASE({ + x.index_params.pq_bits = 6; + x.min_recall = 0.84; + }); + ADD_CASE({ + x.index_params.pq_bits = 7; + x.min_recall = 0.85; + }); + ADD_CASE({ + x.index_params.pq_bits = 8; + x.min_recall = 0.86; + }); - ADD_CASE({ x.index_params.force_random_rotation = true; }); - ADD_CASE({ x.index_params.force_random_rotation = false; }); + ADD_CASE({ + x.index_params.force_random_rotation = true; + x.min_recall = 0.86; + }); + ADD_CASE({ + x.index_params.force_random_rotation = false; + x.min_recall = 0.86; + }); - ADD_CASE({ x.search_params.lut_dtype = CUDA_R_32F; }); - ADD_CASE({ x.search_params.lut_dtype = CUDA_R_16F; }); - ADD_CASE({ x.search_params.lut_dtype = CUDA_R_8U; }); + ADD_CASE({ + x.search_params.lut_dtype = CUDA_R_32F; + x.min_recall = 0.86; + }); + ADD_CASE({ + x.search_params.lut_dtype = CUDA_R_16F; + x.min_recall = 0.86; + }); + ADD_CASE({ + x.search_params.lut_dtype = CUDA_R_8U; + x.min_recall = 0.85; + }); - ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_32F; }); - ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_16F; }); + ADD_CASE({ + x.search_params.internal_distance_dtype = CUDA_R_32F; + x.min_recall = 0.86; + }); + ADD_CASE({ + x.search_params.internal_distance_dtype = CUDA_R_16F; + x.min_recall = 0.86; + }); return xs; } @@ -431,6 +491,14 @@ inline auto enum_variety_ip() -> test_cases_t { return map(enum_variety(), [](const ivf_pq_inputs& x) { ivf_pq_inputs y(x); + if (y.min_recall.has_value()) { + if (y.search_params.lut_dtype == CUDA_R_8U) { + // InnerProduct score is signed, + // thus we're forced to used signed 8-bit representation, + // thus we have one bit less precision + y.min_recall = y.min_recall.value() * 0.95; + } + } y.index_params.metric = distance::DistanceType::InnerProduct; return y; }); diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index 05fe6ab92d..b88b6abd9e 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -232,7 +232,14 @@ auto eval_neighbours(const std::vector& expected_idx, } } double actual_recall = static_cast(match_count) / static_cast(total_count); - RAFT_LOG_INFO("Recall = %f (%zu/%zu)", actual_recall, match_count, total_count); + double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); + RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", + actual_recall, + match_count, + total_count, + std::abs(error_margin * 100.0), + error_margin < 0 ? "above" : "below", + eps); if (actual_recall < min_recall - eps) { if (actual_recall < min_recall * min_recall - eps) { RAFT_LOG_ERROR("Recall is much lower than the minimum (%f < %f)", actual_recall, min_recall); From efd42c9f34fc16b28e94578e7e8460685fa2012d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Sun, 15 Jan 2023 17:26:50 +0100 Subject: [PATCH 16/44] Squared norm fix follow-up (change was lost in merge conflict) (#1144) This change was part of #1141 but was accidentally lost while merging conflicts with #1133 Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1144 --- cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index fac8519a03..b139d8df8c 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -1133,8 +1133,7 @@ void search_impl(const handle_t& handle, static_cast(n_queries), raft::linalg::L2Norm, true, - stream, - raft::sqrt_op()); + stream); utils::outer_add(query_norm_dev.data(), (IdxT)n_queries, index.center_norms()->data_handle(), From 9a97b52880c5c97e2ff7f24b749d6bda87b6cb6f Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 17 Jan 2023 15:59:32 -0500 Subject: [PATCH 17/44] Update builds for CUDA `11.8` and Python `3.10` (#1120) This PR updates the `raft` CI workflows to build against the CUDA `11.8` / Python `3.10` [branch](https://github.com/rapidsai/shared-action-workflows/tree/cuda-118) of the `shared-action-workflows` repository. Authors: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) - Jake Awe (https://github.com/AyodeAwe) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1120 --- .github/workflows/build.yaml | 6 ++-- .github/workflows/pr.yaml | 12 +++---- .github/workflows/test.yaml | 4 +-- README.md | 2 +- ..._64.yaml => all_cuda-118_arch-x86_64.yaml} | 32 +++++++++--------- conda/recipes/libraft/conda_build_config.yaml | 22 ++++++------- dependencies.yaml | 33 ++++++++++++++----- docs/source/build.md | 4 +-- python/pylibraft/setup.py | 1 + python/raft-dask/setup.py | 1 + 10 files changed, 68 insertions(+), 49 deletions(-) rename conda/environments/{all_cuda-115_arch-x86_64.yaml => all_cuda-118_arch-x86_64.yaml} (56%) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f40a8c35cf..d0ded9a530 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a7cd1a8480..c2c64d0003 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,33 +22,33 @@ jobs: - wheel-build-raft-dask - wheel-tests-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-118 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: pull-request wheel-build-pylibraft: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8b94330f86..6eb2c16ba6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 8e0da6cd6d..34d66cbbc3 100755 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ Several CMake targets can be made available by adding components in the table be The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: 1. Create an environment with the needed dependencies: ``` -mamba env create --name raft_dev_env -f conda/environments/all_cuda-115_arch-x86_64.yaml +mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml mamba activate raft_dev_env ``` ``` diff --git a/conda/environments/all_cuda-115_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml similarity index 56% rename from conda/environments/all_cuda-115_arch-x86_64.yaml rename to conda/environments/all_cuda-118_arch-x86_64.yaml index 18e0a8187f..87b7075935 100644 --- a/conda/environments/all_cuda-115_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,37 +12,37 @@ dependencies: - clang-tools=11.1.0 - clang=11.1.0 - cmake>=3.23.1,!=3.25.0 -- cuda-profiler-api>=11.4.240,<=11.8.86 +- cuda-profiler-api=11.8.86 - cuda-python >=11.7.1,<12.0 -- cudatoolkit=11.5 +- cudatoolkit=11.8 - cupy - cxx-compiler - cython>=0.29,<0.30 -- dask-cuda=23.02.* +- dask-cuda=23.02 - dask>=2022.12.0 - distributed>=2022.12.0 - doxygen>=1.8.20 - faiss-proc=*=cuda -- gcc_linux-64=9.* -- libcublas-dev>=11.7.3.1,<=11.7.4.6 -- libcublas>=11.7.3.1,<=11.7.4.6 -- libcurand-dev>=10.2.6.48,<=10.2.7.107 -- libcurand>=10.2.6.48,<=10.2.7.107 -- libcusolver-dev>=11.2.1.48,<=11.3.2.107 -- libcusolver>=11.2.1.48,<=11.3.2.107 -- libcusparse-dev>=11.7.0.31,<=11.7.0.107 -- libcusparse>=11.7.0.31,<=11.7.0.107 -- libfaiss>=1.7.0=cuda* +- gcc_linux-64=9 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1=cuda* - ninja - pytest - pytest-cov -- rmm=23.02.* +- rmm=23.02 - scikit-build>=0.13.1 - scikit-learn - scipy - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py=0.30.* +- ucx-py=0.30 - ucx>=1.13.0 -name: all_cuda-115_arch-x86_64 +name: all_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 399dd198eb..1012bddb40 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,42 +20,42 @@ gtest_version: - "=1.10.0" libfaiss_version: - - "1.7.0 *_cuda" + - "1.7.2 *_cuda" # The CTK libraries below are missing from the conda-forge::cudatoolkit -# package. The "*_host_*" version specifiers correspond to `11.5` packages and the +# package. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. libcublas_host_version: - - ">=11.7.3.1,<=11.7.4.6" + - "=11.11.3.6" libcublas_run_version: - - ">=11.5.2.43,<=11.11.3.6" + - ">=11.5.2.43,<12.0.0" libcurand_host_version: - - ">=10.2.6.48,<=10.2.7.107" + - "=10.3.0.86" libcurand_run_version: - - ">=10.2.5.43,<=10.3.0.86" + - ">=10.2.5.43,<10.3.1" libcusolver_host_version: - - ">=11.2.1.48,<=11.3.2.107" + - "=11.4.1.48" libcusolver_run_version: - - ">=11.2.0.43,<=11.4.1.48" + - ">=11.2.0.43,<11.4.2" libcusparse_host_version: - - ">=11.7.0.31,<=11.7.0.107" + - "=11.7.5.86" libcusparse_run_version: - - ">=11.6.0.43,<=11.7.5.86" + - ">=11.6.0.43,<12.0.0" # `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all # architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. cuda_profiler_api_host_version: - - ">=11.8.86,<12" + - "=11.8.86" cuda_profiler_api_run_version: - ">=11.4.240,<12" diff --git a/dependencies.yaml b/dependencies.yaml index 52054d9c7d..ae900542c0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.5"] + cuda: ["11.8"] arch: [x86_64] includes: - build @@ -53,12 +53,12 @@ dependencies: - matrix: arch: x86_64 packages: - - gcc_linux-64=9.* + - gcc_linux-64=9 - sysroot_linux-64==2.17 - matrix: arch: aarch64 packages: - - gcc_linux-aarch64=9.* + - gcc_linux-aarch64=9 - sysroot_linux-aarch64==2.17 checks: common: @@ -77,6 +77,19 @@ dependencies: specific: - output_types: conda matrices: + - matrix: + cuda: "11.8" + packages: + - cudatoolkit=11.8 + - cuda-profiler-api=11.8.86 + - libcublas-dev=11.11.3.6 + - libcublas=11.11.3.6 + - libcurand-dev=10.3.0.86 + - libcurand=10.3.0.86 + - libcusolver-dev=11.4.1.48 + - libcusolver=11.4.1.48 + - libcusparse-dev=11.7.5.86 + - libcusparse=11.7.5.86 - matrix: cuda: "11.5" packages: @@ -142,22 +155,26 @@ dependencies: py: "3.9" packages: - python=3.9 + - matrix: + py: "3.10" + packages: + - python=3.10 - matrix: packages: - - python>=3.8,<3.10 + - python>=3.8,<3.11 run: common: - output_types: [conda] packages: - - rmm=23.02.* + - rmm=23.02 - dask>=2022.12.0 - distributed>=2022.12.0 - ucx>=1.13.0 - - ucx-py=0.30.* + - ucx-py=0.30 - ucx-proc=*=gpu - - libfaiss>=1.7.0=cuda* + - libfaiss>=1.7.1=cuda* - faiss-proc=*=cuda - - dask-cuda=23.02.* + - dask-cuda=23.02 test_python: common: - output_types: [conda, requirements] diff --git a/docs/source/build.md b/docs/source/build.md index c88cf6c162..4052e49cf8 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -180,10 +180,10 @@ Currently, shared libraries are provided for the `libraft-nn` and `libraft-dista ### Python -Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. The following example will install create and install dependencies for a CUDA 11.5 conda environment: +Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. The following example will install create and install dependencies for a CUDA 11.8 conda environment: ```bash -mamba env create --name raft_env_name -f conda/environments/all_cuda-115_arch-x86_64.yaml +mamba env create --name raft_env_name -f conda/environments/all_cuda-118_arch-x86_64.yaml mamba activate raft_env_name ``` diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index 15889fcd71..079825a30c 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -69,6 +69,7 @@ def get_versions(): "Programming Language :: Python", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], author="NVIDIA Corporation", include_package_data=True, diff --git a/python/raft-dask/setup.py b/python/raft-dask/setup.py index 7009a9ab44..02bbced9a3 100644 --- a/python/raft-dask/setup.py +++ b/python/raft-dask/setup.py @@ -73,6 +73,7 @@ def get_versions(): "Programming Language :: Python", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], author="NVIDIA Corporation", include_package_data=True, From 0cefbfb7ce4bfaa03f993aa615ae264792a8b4f5 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 17 Jan 2023 18:02:04 -0800 Subject: [PATCH 18/44] Remove faiss bfKnn call from fused_l2_knn unittest (#1150) Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1150 --- cpp/test/neighbors/fused_l2_knn.cu | 84 ++++++++++++------------------ 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/cpp/test/neighbors/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu index d57f99da50..ca20bebaf6 100644 --- a/cpp/test/neighbors/fused_l2_knn.cu +++ b/cpp/test/neighbors/fused_l2_knn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,14 +16,10 @@ #include "../test_utils.cuh" -#include -#include - #include #include #include #include -#include #include #if defined RAFT_NN_COMPILED @@ -112,8 +108,8 @@ class FusedL2KNNTest : public ::testing::TestWithParam { search_queries(params_.num_queries * params_.dim, stream_), raft_indices_(params_.num_queries * params_.k, stream_), raft_distances_(params_.num_queries * params_.k, stream_), - faiss_indices_(params_.num_queries * params_.k, stream_), - faiss_distances_(params_.num_queries * params_.k, stream_) + ref_indices_(params_.num_queries * params_.k, stream_), + ref_distances_(params_.num_queries * params_.k, stream_) { RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(T), stream_)); RAFT_CUDA_TRY( @@ -123,15 +119,32 @@ class FusedL2KNNTest : public ::testing::TestWithParam { RAFT_CUDA_TRY( cudaMemsetAsync(raft_distances_.data(), 0, raft_distances_.size() * sizeof(T), stream_)); RAFT_CUDA_TRY( - cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_)); + cudaMemsetAsync(ref_indices_.data(), 0, ref_indices_.size() * sizeof(int64_t), stream_)); RAFT_CUDA_TRY( - cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_)); + cudaMemsetAsync(ref_distances_.data(), 0, ref_distances_.size() * sizeof(T), stream_)); } protected: void testBruteForce() { - launchFaissBfknn(); + // calculate the naive knn, by calculating the full pairwise distances and doing a k-select + rmm::device_uvector temp_distances(num_db_vecs * num_queries, stream_); + distance::pairwise_distance( + handle_, + raft::make_device_matrix_view(search_queries.data(), num_queries, dim), + raft::make_device_matrix_view(database.data(), num_db_vecs, dim), + raft::make_device_matrix_view(temp_distances.data(), num_queries, num_db_vecs), + metric); + + spatial::knn::select_k(temp_distances.data(), + nullptr, + num_queries, + num_db_vecs, + ref_distances_.data(), + ref_indices_.data(), + true, + k_, + stream_); auto index_view = raft::make_device_matrix_view(database.data(), num_db_vecs, dim); @@ -145,14 +158,14 @@ class FusedL2KNNTest : public ::testing::TestWithParam { handle_, index_view, query_view, out_indices_view, out_dists_view, metric); // verify. - devArrMatchKnnPair(faiss_indices_.data(), - raft_indices_.data(), - faiss_distances_.data(), - raft_distances_.data(), - num_queries, - k_, - float(0.001), - stream_); + ASSERT_TRUE(devArrMatchKnnPair(ref_indices_.data(), + raft_indices_.data(), + ref_distances_.data(), + raft_distances_.data(), + num_queries, + k_, + float(0.001), + stream_)); } void SetUp() override @@ -169,34 +182,6 @@ class FusedL2KNNTest : public ::testing::TestWithParam { uniform(handle_, r, search_queries.data(), num_queries * dim, T(-1.0), T(1.0)); } - void launchFaissBfknn() - { - faiss::MetricType m = detail::build_faiss_metric(metric); - - faiss::gpu::StandardGpuResources gpu_res; - - gpu_res.noTempMemory(); - int device; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - gpu_res.setDefaultStream(device, stream_); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = 0; - args.k = k_; - args.dims = dim; - args.vectors = database.data(); - args.vectorsRowMajor = true; - args.numVectors = num_db_vecs; - args.queries = search_queries.data(); - args.queriesRowMajor = true; - args.numQueries = num_queries; - args.outDistances = faiss_distances_.data(); - args.outIndices = faiss_indices_.data(); - - bfKnn(&gpu_res, args); - } - private: raft::handle_t handle_; cudaStream_t stream_ = 0; @@ -208,8 +193,8 @@ class FusedL2KNNTest : public ::testing::TestWithParam { rmm::device_uvector search_queries; rmm::device_uvector raft_indices_; rmm::device_uvector raft_distances_; - rmm::device_uvector faiss_indices_; - rmm::device_uvector faiss_distances_; + rmm::device_uvector ref_indices_; + rmm::device_uvector ref_distances_; int k_; raft::distance::DistanceType metric; }; @@ -223,7 +208,6 @@ const std::vector inputs = { {1000, 10000, 16, 50, raft::distance::DistanceType::L2Expanded}, {1000, 10000, 32, 50, raft::distance::DistanceType::L2Expanded}, {10000, 40000, 32, 30, raft::distance::DistanceType::L2Expanded}, - {131072, 131072, 8, 60, raft::distance::DistanceType::L2Expanded}, // L2 unexpanded {100, 1000, 16, 10, raft::distance::DistanceType::L2Unexpanded}, {1000, 10000, 16, 10, raft::distance::DistanceType::L2Unexpanded}, @@ -232,7 +216,7 @@ const std::vector inputs = { {1000, 10000, 16, 50, raft::distance::DistanceType::L2Unexpanded}, {1000, 10000, 32, 50, raft::distance::DistanceType::L2Unexpanded}, {10000, 40000, 32, 30, raft::distance::DistanceType::L2Unexpanded}, - {131072, 131072, 8, 60, raft::distance::DistanceType::L2Unexpanded}}; +}; typedef FusedL2KNNTest FusedL2KNNTestF; TEST_P(FusedL2KNNTestF, FusedBruteForce) { this->testBruteForce(); } From 187ff9e2507233da041775be3f97de5a8833017b Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 18 Jan 2023 18:09:14 +0100 Subject: [PATCH 19/44] Catch signal handler change error (#1147) The interruptible signal handle was incompatible with dask, it would trigger the following error: ``` Exception: "ValueError('signal only works in main thread of the main interpreter')" ``` This PR fixes the problem by catching the error and keeping the original signal handler in that case. Authors: - Tamas Bela Feher (https://github.com/tfeher) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1147 --- python/pylibraft/pylibraft/common/interruptible.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx index c7f738f2e5..bb5415428f 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pyx +++ b/python/pylibraft/pylibraft/common/interruptible.pyx @@ -54,11 +54,17 @@ def cuda_interruptible(): with nogil: dereference(token).cancel() - oldhr = signal.signal(signal.SIGINT, newhr) + try: + oldhr = signal.signal(signal.SIGINT, newhr) + except ValueError: + # the signal creation would fail if this is not the main thread + # That's fine! The feature is disabled. + oldhr = None try: yield finally: - signal.signal(signal.SIGINT, oldhr) + if oldhr is not None: + signal.signal(signal.SIGINT, oldhr) def synchronize(stream: Stream): From a7399cb5a87126798bb3de2701b1566496bb20c5 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Thu, 19 Jan 2023 05:42:46 -0800 Subject: [PATCH 20/44] Fix various build errors (#1152) Fix various build errors I encountered as I tried to build RAFT locally on my workstation. (Command used: `./build.sh -g raft-dask pylibraft libraft tests bench --compile-libs`) * Add `gtest` as a link dependency of the C++ benchmark suite, to fix the error ``` [266/332] Building CUDA object CMakeFiles/NEIGHBORS_BENCH.dir/bench/neighbors/refine.cu.o FAILED: CMakeFiles/NEIGHBORS_BENCH.dir/bench/neighbors/refine.cu.o In file included from /home/phcho/Desktop/raft/cpp/bench/neighbors/../../test/neighbors/../test_utils.cuh:19, from /home/phcho/Desktop/raft/cpp/bench/neighbors/../../test/neighbors/ann_utils.cuh:28, from /home/phcho/Desktop/raft/cpp/bench/neighbors/../../test/neighbors/refine_helper.cuh:18, from /home/phcho/Desktop/raft/cpp/bench/neighbors/refine.cu:39: /home/phcho/Desktop/raft/cpp/bench/neighbors/../../test/neighbors/../test_utils.h:22:10: fatal error: gtest/gtest.h: No such file or directory 22 | #include | ^~~~~~~~~~~~~~~ compilation terminated. ``` * Explicitly specify the namespace for `alignTo`. * Cast pointers into an integral type prior to passing it to `alignTo`. * When calling `areSameAlignOffsets()`, pass the underlying pointers of the mdspan objects. Passing an mdspan to `areSameAlignOffsets()` is an error. Authors: - Philip Hyunsu Cho (https://github.com/hcho3) Approvers: - Micka (https://github.com/lowener) - Robert Maynard (https://github.com/robertmaynard) - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1152 --- cpp/bench/CMakeLists.txt | 3 ++- cpp/include/raft/core/device_mdspan.hpp | 6 ++++-- cpp/include/raft/core/host_mdspan.hpp | 6 ++++-- cpp/include/raft/matrix/detail/linewise_op.cuh | 5 +++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 99606dd2e9..813483adc5 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft $<$:raft::distance> $<$:raft::nn> + GTest::gtest benchmark::benchmark Threads::Threads $ diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index f64f15d0d5..f72ae36d64 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,7 +197,9 @@ auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexTy detail::alignment::value>::data_handle_type; static_assert(std::is_same>::value || std::is_same>::value); - assert(ptr == alignTo(ptr, detail::alignment::value)); + assert(reinterpret_cast(ptr) == + std::experimental::details::alignTo(reinterpret_cast(ptr), + detail::alignment::value)); data_handle_type aligned_pointer = ptr; diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index 1a0ea6432f..a6cdec7a84 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,7 +144,9 @@ auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType static_assert(std::is_same>::value || std::is_same>::value); - assert(ptr == alignTo(ptr, detail::alignment::value)); + assert(reinterpret_cast(ptr) == + std::experimental::details::alignTo(reinterpret_cast(ptr), + detail::alignment::value)); data_handle_type aligned_pointer = ptr; matrix_extent extents{n_rows, n_cols}; diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 605726bea6..ef8f0e88c1 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -796,7 +796,8 @@ struct MatrixLinewiseOp { "layout for in and out must be either padded row or col major"); // also statically assert padded matrix alignment == 2^i*VecBytes - assert(raft::Pow2::areSameAlignOffsets(in, out)); + RAFT_EXPECTS(raft::Pow2::areSameAlignOffsets(in.data_handle(), out.data_handle()), + "The matrix views in and out does not have correct alignment"); if (alongLines) return matrixLinewiseVecRowsSpan Date: Thu, 19 Jan 2023 15:10:48 +0100 Subject: [PATCH 21/44] balanced-k-means: fix a too large initial memory pool size (#1148) `calc_minibatch_size` decides on the batch size under assumption that the workspace shouldn't exceed 1GB. It takes into account that fewer extra buffers are needed when the data type `T` is float. However, we don't take this into account when setting the initial memory pool size immediately after calculating `max_minibatch_size`. As a result, under some conditions, the algorithm attempts to allocate more memory than available. This PR sets the limit of the initial pool size to 1GB to fix the issue. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1148 --- .../knn/detail/ann_kmeans_balanced.cuh | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 961cc76381..72df13d760 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,9 +18,6 @@ #include "ann_utils.cuh" -#include -#include - #include #include #include @@ -45,6 +42,11 @@ #include #include +#include +#include + +#include + namespace raft::spatial::knn::detail::kmeans { constexpr static inline const float kAdjustCentersWeight = 7.0f; @@ -170,35 +172,40 @@ inline void predict_float_core(const handle_t& handle, * * @param n_clusters number of clusters in kmeans clustering * @param n_rows dataset size - * @return a suggested minibatch size + * @param dim + * @param metric + * @param is_float float input requires less temporary buffers + * @return a suggested minibatch size and the expected memory cost per-row (in bytes) */ template -constexpr inline auto calc_minibatch_size(uint32_t n_clusters, - IdxT n_rows, - uint32_t dim, - raft::distance::DistanceType metric, - bool is_float) -> IdxT +constexpr auto calc_minibatch_size( + uint32_t n_clusters, IdxT n_rows, uint32_t dim, distance::DistanceType metric, bool is_float) + -> std::tuple { n_clusters = std::max(1, n_clusters); // Estimate memory needs per row (i.e element of the batch). - IdxT mem_per_row = 0; - /* fusedL2NN only needs one integer per row for a mutex. - * Other metrics require storing a distance matrix. */ - if (metric != raft::distance::DistanceType::L2Expanded && - metric != raft::distance::DistanceType::L2SqrtExpanded) { - mem_per_row += sizeof(float) * n_clusters; - } else { - mem_per_row += sizeof(int); + size_t mem_per_row = 0; + switch (metric) { + // fusedL2NN only needs one integer per row for a mutex. + case distance::DistanceType::L2Expanded: + case distance::DistanceType::L2SqrtExpanded: { + mem_per_row += sizeof(int); + } break; + // Other metrics require storing a distance matrix. + default: { + mem_per_row += sizeof(float) * n_clusters; + } } + // If we need to convert to float, space required for the converted batch. if (!is_float) { mem_per_row += sizeof(float) * dim; } // Heuristic: calculate the minibatch size in order to use at most 1GB of memory. IdxT minibatch_size = (1 << 30) / mem_per_row; - minibatch_size = 64 * ceildiv(minibatch_size, (IdxT)64); + minibatch_size = 64 * div_rounding_up_safe(minibatch_size, IdxT{64}); minibatch_size = std::min(minibatch_size, n_rows); - return minibatch_size; + return std::make_tuple(minibatch_size, mem_per_row); } /** @@ -383,7 +390,7 @@ void predict(const handle_t& handle, common::nvtx::range fun_scope( "kmeans::predict(%zu, %u)", static_cast(n_rows), n_clusters); if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - IdxT max_minibatch_size = + auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v); rmm::device_uvector cur_dataset( std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); @@ -972,9 +979,10 @@ void build_hierarchical(const handle_t& handle, rmm::mr::managed_memory_resource managed_memory; rmm::mr::device_memory_resource* device_memory = nullptr; - IdxT max_minibatch_size = + auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v); - auto pool_guard = raft::get_pool_memory_resource(device_memory, max_minibatch_size * dim * 4); + auto pool_guard = + raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size)); if (pool_guard) { RAFT_LOG_DEBUG( "kmeans::build_hierarchical: using pool memory resource with initial size %zu bytes", From f2bc24dabd6fad9e4ef1f62183466073ce3fb176 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 19 Jan 2023 16:16:59 -0800 Subject: [PATCH 22/44] Remove faiss ANN code from knnIndex (#1121) Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1121 --- cpp/include/raft/spatial/knn/ann_common.h | 26 +--- .../raft/spatial/knn/detail/ann_quantized.cuh | 130 ++++-------------- cpp/test/neighbors/ann_ivf_flat.cu | 2 - 3 files changed, 31 insertions(+), 127 deletions(-) diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index a0d79a1b77..0e9e323b84 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,12 +22,10 @@ #include "detail/processing.hpp" #include "ivf_flat_types.hpp" +#include #include -#include -#include - namespace raft { namespace spatial { namespace knn { @@ -36,13 +34,14 @@ struct knnIndex { raft::distance::DistanceType metric; float metricArg; int nprobe; - std::unique_ptr index; std::unique_ptr> metric_processor; + std::unique_ptr> ivf_flat_float_; std::unique_ptr> ivf_flat_uint8_t_; std::unique_ptr> ivf_flat_int8_t_; - std::unique_ptr gpu_res; + std::unique_ptr> ivf_pq; + int device; template @@ -70,16 +69,6 @@ inline auto knnIndex::ivf_flat() return ivf_flat_int8_t_; } -enum QuantizerType : unsigned int { - QT_8bit, - QT_4bit, - QT_8bit_uniform, - QT_4bit_uniform, - QT_fp16, - QT_8bit_direct, - QT_6bit -}; - struct knnIndexParam { virtual ~knnIndexParam() {} }; @@ -98,11 +87,6 @@ struct IVFPQParam : IVFParam { bool usePrecomputedTables; }; -struct IVFSQParam : IVFParam { - QuantizerType qtype; - bool encodeResidual; -}; - inline auto from_legacy_index_params(const IVFFlatParam& legacy, raft::distance::DistanceType metric, float metric_arg) diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 975f1a0f89..f651e943e3 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -18,9 +18,7 @@ #include "../ann_common.h" #include "../ivf_flat.cuh" -#include "knn_brute_force_faiss.cuh" -#include "common_faiss.h" #include "processing.cuh" #include #include @@ -29,83 +27,14 @@ #include #include #include -#include +#include #include -#include -#include -#include -#include - #include namespace raft::spatial::knn::detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) -{ - switch (qtype) { - case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; - case QuantizerType::QT_8bit_uniform: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; - case QuantizerType::QT_4bit_uniform: - return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; - case QuantizerType::QT_8bit_direct: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: return (faiss::ScalarQuantizer::QuantizerType)qtype; - } -} - -template -void approx_knn_ivfflat_build_index(knnIndex* index, - const IVFFlatParam& params, - IntType n, - IntType D) -{ - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(index->metric); - index->index.reset( - new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.nlist, faiss_metric, config)); -} - -template -void approx_knn_ivfpq_build_index(knnIndex* index, const IVFPQParam& params, IntType n, IntType D) -{ - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params.usePrecomputedTables; - config.interleavedLayout = params.n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(index->metric); - index->index.reset(new faiss::gpu::GpuIndexIVFPQ( - index->gpu_res.get(), D, params.nlist, params.M, params.n_bits, faiss_metric, config)); -} - -template -void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, IntType n, IntType D) -{ - faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(index->metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype); - index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer( - index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, params.encodeResidual)); -} - -inline bool ivf_flat_supported_metric(raft::distance::DistanceType metric) -{ - switch (metric) { - case raft::distance::DistanceType::L2Unexpanded: - case raft::distance::DistanceType::L2Expanded: - case raft::distance::DistanceType::L2SqrtExpanded: - case raft::distance::DistanceType::L2SqrtUnexpanded: - case raft::distance::DistanceType::InnerProduct: return true; - default: return false; - } -} - template void approx_knn_build_index(const handle_t& handle, knnIndex* index, @@ -117,7 +46,6 @@ void approx_knn_build_index(const handle_t& handle, IntType D) { auto stream = handle.get_stream(); - index->index = nullptr; index->metric = metric; index->metricArg = metricArg; if (dynamic_cast(params)) { @@ -125,37 +53,35 @@ void approx_knn_build_index(const handle_t& handle, } auto ivf_ft_pams = dynamic_cast(params); auto ivf_pq_pams = dynamic_cast(params); - auto ivf_sq_pams = dynamic_cast(params); if constexpr (std::is_same_v) { index->metric_processor = create_processor(metric, n, D, 0, false, stream); + // For cosine/correlation distance, the metric processor translates distance + // to inner product via pre/post processing - pass the translated metric to + // ANN index + if (metric == raft::distance::DistanceType::CosineExpanded || + metric == raft::distance::DistanceType::CorrelationExpanded) { + metric = index->metric = raft::distance::DistanceType::InnerProduct; + } } if constexpr (std::is_same_v) { index->metric_processor->preprocess(index_array); } - if (ivf_ft_pams && ivf_flat_supported_metric(metric)) { + if (ivf_ft_pams) { auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg); index->ivf_flat() = std::make_unique>( ivf_flat::build(handle, new_params, index_array, int64_t(n), D)); + } else if (ivf_pq_pams) { + neighbors::ivf_pq::index_params params; + params.metric = metric; + params.metric_arg = metricArg; + params.n_lists = ivf_pq_pams->nlist; + params.pq_bits = ivf_pq_pams->n_bits; + params.pq_dim = ivf_pq_pams->M; + // TODO: handle ivf_pq_pams.usePrecomputedTables ? + index->ivf_pq = std::make_unique>( + neighbors::ivf_pq::build(handle, params, index_array, int64_t(n), D)); } else { - RAFT_CUDA_TRY(cudaGetDevice(&(index->device))); - index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources()); - index->gpu_res->noTempMemory(); - index->gpu_res->setDefaultStream(index->device, stream); - if (ivf_ft_pams) { - approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D); - } else if (ivf_pq_pams) { - approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D); - } else if (ivf_sq_pams) { - approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D); - } else { - RAFT_FAIL("Unrecognized index type."); - } - if constexpr (std::is_same_v) { - index->index->train(n, index_array); - index->index->add(n, index_array); - } else { - RAFT_FAIL("FAISS-based index supports only float data."); - } + RAFT_FAIL("Unrecognized index type."); } if constexpr (std::is_same_v) { index->metric_processor->revert(index_array); } @@ -170,26 +96,22 @@ void approx_knn_search(const handle_t& handle, T* query_array, IntType n) { - auto faiss_ivf = dynamic_cast(index->index.get()); - if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); } - if constexpr (std::is_same_v) { index->metric_processor->preprocess(query_array); index->metric_processor->set_num_queries(k); } // search - if (faiss_ivf) { - if constexpr (std::is_same_v) { - faiss_ivf->search(n, query_array, k, distances, indices); - } else { - RAFT_FAIL("FAISS-based index supports only float data."); - } - } else if (index->ivf_flat()) { + if (index->ivf_flat()) { ivf_flat::search_params params; params.n_probes = index->nprobe; ivf_flat::search( handle, params, *(index->ivf_flat()), query_array, n, k, indices, distances); + } else if (index->ivf_pq) { + neighbors::ivf_pq::search_params params; + params.n_probes = index->nprobe; + neighbors::ivf_pq::search( + handle, params, *index->ivf_pq, query_array, n, k, indices, distances); } else { RAFT_FAIL("The model is not trained"); } diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu index 86a62bb487..080e7551fa 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cu +++ b/cpp/test/neighbors/ann_ivf_flat.cu @@ -107,8 +107,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { ivfParams.nprobe = ps.nprobe; ivfParams.nlist = ps.nlist; raft::spatial::knn::knnIndex index; - index.index = nullptr; - index.gpu_res = nullptr; approx_knn_build_index(handle_, &index, From d233a2cba9108b37727440e88d0ad6e406d28d5f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 19 Jan 2023 19:34:52 -0800 Subject: [PATCH 23/44] Use squeuclidean for metric name in ivf_pq python bindings (#1160) Use sqeuclidean instead of l2_expanded for the distance name in the ivf_pq python bindings. This matches both sklearn, and the RAFT pairwise_distance api - and should be less confusing for our users Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1160 --- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 23 ++++++++++------ .../pylibraft/pylibraft/neighbors/refine.pyx | 6 ++--- .../pylibraft/pylibraft/test/test_ivf_pq.py | 26 +++++++++---------- .../pylibraft/pylibraft/test/test_refine.py | 8 +++--- 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index ee30864193..8f8a49fb63 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -18,6 +18,8 @@ # cython: embedsignature = True # cython: language_level = 3 +import warnings + import numpy as np from cython.operator cimport dereference as deref @@ -63,17 +65,22 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport ( def _get_metric(metric): SUPPORTED_DISTANCES = { - "l2_expanded": DistanceType.L2Expanded, + "sqeuclidean": DistanceType.L2Expanded, "euclidean": DistanceType.L2SqrtExpanded, "inner_product": DistanceType.InnerProduct } if metric not in SUPPORTED_DISTANCES: + if metric == "l2_expanded": + warnings.warn("Using l2_expanded as a metric name is deprecated," + " use sqeuclidean instead", FutureWarning) + return DistanceType.L2Expanded + raise ValueError("metric %s is not supported" % metric) return SUPPORTED_DISTANCES[metric] cdef _get_metric_string(DistanceType metric): - return {DistanceType.L2Expanded : "l2_expanded", + return {DistanceType.L2Expanded : "sqeuclidean", DistanceType.InnerProduct: "inner_product", DistanceType.L2SqrtExpanded: "euclidean"}[metric] @@ -118,7 +125,7 @@ cdef class IndexParams: def __init__(self, *, n_lists=1024, - metric="l2_expanded", + metric="sqeuclidean", kmeans_n_iters=20, kmeans_trainset_fraction=0.5, pq_bits=8, @@ -133,10 +140,10 @@ cdef class IndexParams: ---------- n_list : int, default = 1024 The number of clusters used in the coarse quantizer. - metric : string denoting the metric type, default="l2_expanded" - Valid values for metric: ["l2_expanded", "inner_product", + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product", "euclidean"], where - - l2_expanded is the euclidean distance without the square root + - sqeuclidean is the euclidean distance without the square root operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - euclidean is the euclidean distance - inner product distance is defined as @@ -251,7 +258,7 @@ cdef class Index: # We create a placeholder object. The actual parameter values do # not matter, it will be replaced with a built index object later. self.index = new c_ivf_pq.index[uint64_t]( - deref(handle_), _get_metric("l2_expanded"), + deref(handle_), _get_metric("sqeuclidean"), c_ivf_pq.codebook_gen.PER_SUBSPACE, 1, 4, @@ -347,7 +354,7 @@ def build(IndexParams index_params, dataset, handle=None): >>> handle = Handle() >>> index_params = ivf_pq.IndexParams( ... n_lists=1024, - ... metric="l2_expanded", + ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx index 37ef69e7b5..b8f1bd0caa 100644 --- a/python/pylibraft/pylibraft/neighbors/refine.pyx +++ b/python/pylibraft/pylibraft/neighbors/refine.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -215,7 +215,7 @@ cdef host_matrix_view[int8_t, uint64_t, row_major] \ @auto_sync_handle @auto_convert_output def refine(dataset, queries, candidates, k=None, indices=None, distances=None, - metric="l2_expanded", handle=None): + metric="sqeuclidean", handle=None): """ Refine nearest neighbor search. @@ -271,7 +271,7 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None, >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = Handle() - >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="l2_expanded", + >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py index db1389c6cd..6952408c02 100644 --- a/python/pylibraft/pylibraft/test/test_ivf_pq.py +++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -58,7 +58,7 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None): for i in range(queries.shape[0]): X = queries[np.newaxis, i, :] Y = dataset[out_idx[i, :], :] - if metric == "l2_expanded": + if metric == "sqeuclidean": dist[i, :] = pairwise_distances(X, Y, "sqeuclidean") elif metric == "euclidean": dist[i, :] = pairwise_distances(X, Y, "euclidean") @@ -177,7 +177,7 @@ def run_ivf_pq_build_search_test( # Calculate reference values with sklearn skl_metric = { - "l2_expanded": "sqeuclidean", + "sqeuclidean": "sqeuclidean", "inner_product": "cosine", "euclidean": "euclidean", }[metric] @@ -204,14 +204,14 @@ def test_ivf_pq_dtypes( n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type ): # Note that inner_product tests use normalized input which we cannot - # represent in int8, therefore we test only l2_expanded metric here. + # represent in int8, therefore we test only sqeuclidean metric here. run_ivf_pq_build_search_test( n_rows=n_rows, n_cols=n_cols, n_queries=n_queries, k=10, n_lists=n_lists, - metric="l2_expanded", + metric="sqeuclidean", dtype=dtype, inplace=inplace, array_type=array_type, @@ -246,14 +246,14 @@ def test_ivf_pq_n(params): n_queries=params["n_queries"], k=params["k"], n_lists=params["n_lists"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, compare=False, ) @pytest.mark.parametrize( - "metric", ["l2_expanded", "inner_product", "euclidean"] + "metric", ["sqeuclidean", "inner_product", "euclidean"] ) @pytest.mark.parametrize("dtype", [np.float32]) @pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"]) @@ -298,7 +298,7 @@ def test_ivf_pq_params(params): n_queries=1000, k=10, n_lists=params["n_lists"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, pq_bits=params["pq_bits"], pq_dim=params["pq_dims"], @@ -344,7 +344,7 @@ def test_ivf_pq_search_params(params): k=params["k"], n_lists=100, n_probes=params["n_probes"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, lut_dtype=params["lut"], internal_distance_dtype=params["idd"], @@ -360,7 +360,7 @@ def test_extend(dtype, array_type): n_queries=100, k=10, n_lists=100, - metric="l2_expanded", + metric="sqeuclidean", dtype=dtype, add_data_on_build=False, array_type=array_type, @@ -375,7 +375,7 @@ def test_build_assertions(): n_queries=100, k=10, n_lists=100, - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float64, ) @@ -388,7 +388,7 @@ def test_build_assertions(): index_params = ivf_pq.IndexParams( n_lists=50, - metric="l2_expanded", + metric="sqeuclidean", kmeans_n_iters=20, kmeans_trainset_fraction=1, add_data_on_build=False, @@ -482,7 +482,7 @@ def test_search_inputs(params): out_dist_device = device_ndarray(out_dist) index_params = ivf_pq.IndexParams( - n_lists=50, metric="l2_expanded", add_data_on_build=True + n_lists=50, metric="sqeuclidean", add_data_on_build=True ) dataset = generate_data((n_rows, n_cols), dtype) @@ -511,7 +511,7 @@ def test_save_load(): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ivf_pq.IndexParams(n_lists=100, metric="l2_expanded") + build_params = ivf_pq.IndexParams(n_lists=100, metric="sqeuclidean") index = ivf_pq.build(build_params, dataset_device) assert index.trained diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py index 2f3bef2e0c..8502d0575c 100644 --- a/python/pylibraft/pylibraft/test/test_refine.py +++ b/python/pylibraft/pylibraft/test/test_refine.py @@ -27,7 +27,7 @@ def run_refine( n_rows=500, n_cols=50, n_queries=100, - metric="l2_expanded", + metric="sqeuclidean", k0=40, k=10, inplace=False, @@ -49,7 +49,7 @@ def run_refine( queries_device = device_ndarray(queries) # Calculate reference values with sklearn - skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[ + skl_metric = {"sqeuclidean": "euclidean", "inner_product": "cosine"}[ metric ] nn_skl = NearestNeighbors( @@ -106,7 +106,7 @@ def run_refine( if recall <= 0.999: # We did not find the same neighbor indices. # We could have found other neighbor with same distance. - if metric == "l2_expanded": + if metric == "sqeuclidean": skl_dist = np.power(skl_dist[:, :k], 2) elif metric == "inner_product": skl_dist = 1 - skl_dist[:, :k] @@ -120,7 +120,7 @@ def run_refine( @pytest.mark.parametrize("n_queries", [100, 1024, 37]) @pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("memory_type", ["device", "host"]) def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type): From 102a4f036a27bff30749f67b1133611575ee0603 Mon Sep 17 00:00:00 2001 From: Sevag H Date: Fri, 20 Jan 2023 09:00:21 -0500 Subject: [PATCH 24/44] Make cutlass use static ctk (#1155) Cutlass links to the CTK in ways that cause problems for downstream pip wheel builds (especially in cugraph). This might help. Authors: - Sevag H (https://github.com/sevagh) Approvers: - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/raft/pull/1155 --- cpp/cmake/thirdparty/get_cutlass.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 811a5466c3..3e02ce064e 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -30,6 +30,10 @@ function(find_and_configure_cutlass) CACHE BOOL "Disable CUTLASS to build with cuBLAS library." ) + if (CUDA_STATIC_RUNTIME) + set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE) + endif() + rapids_cpm_find( NvidiaCutlass ${PKG_VERSION} GLOBAL_TARGETS nvidia::cutlass::cutlass From b70519e631cedee4fd652215fb71a1a6c0545c85 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Sat, 21 Jan 2023 02:09:15 +0100 Subject: [PATCH 25/44] Protect balanced k-means out-of-memory in some cases (#1161) There's no guarantee that our balanced k-means implementation always produces balanced clusters. In the first stage, when mesoclusters are trained, the biggest cluster can grow larger than half of all input data. This becomes a problem at the second stage, when in `build_fine_clusters`, the mesocluster data is copied in a temporary buffer. If size is too big, there may be not enough memory on the device. A quick workaround: 1. Expand the error reporting (RAFT_LOG_WARN) 2. Artificially limit the mesocluster size in the event of highly unbalanced clustering Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1161 --- .../knn/detail/ann_kmeans_balanced.cuh | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 72df13d760..c6a3aea0cf 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -839,6 +839,10 @@ inline auto arrange_fine_clusters(uint32_t n_clusters, * As a result, the fine clusters are what is returned by `build_hierarchical`; * this function returns the total number of fine clusters, which can be checked to be * the same as the requested number of clusters. + * + * Note: this function uses at most `fine_clusters_nums_max` points per mesocluster for training; + * if one of the clusters is larger than that (as given by `mesocluster_sizes`), the extra data + * is ignored and a warning is reported. */ template auto build_fine_clusters(const handle_t& handle, @@ -880,8 +884,8 @@ auto build_fine_clusters(const handle_t& handle, uint32_t n_clusters_done = 0; for (uint32_t i = 0; i < n_mesoclusters; i++) { uint32_t k = 0; - for (IdxT j = 0; j < n_rows; j++) { - if (labels_mptr[j] == (LabelT)i) { mc_trainset_ids[k++] = j; } + for (IdxT j = 0; j < n_rows && k < mesocluster_size_max; j++) { + if (labels_mptr[j] == LabelT(i)) { mc_trainset_ids[k++] = j; } } if (k != mesocluster_sizes[i]) RAFT_LOG_WARN("Incorrect mesocluster size at %d. %d vs %d", i, k, mesocluster_sizes[i]); @@ -896,19 +900,13 @@ auto build_fine_clusters(const handle_t& handle, "Number of fine clusters must be non-zero for a non-empty mesocluster"); } - utils::copy_selected((IdxT)mesocluster_sizes[i], - (IdxT)dim, - dataset_mptr, - mc_trainset_ids, - (IdxT)dim, - mc_trainset, - (IdxT)dim, - stream); + utils::copy_selected( + (IdxT)k, (IdxT)dim, dataset_mptr, mc_trainset_ids, (IdxT)dim, mc_trainset, (IdxT)dim, stream); if (metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded) { thrust::gather(handle.get_thrust_policy(), mc_trainset_ids, - mc_trainset_ids + mesocluster_sizes[i], + mc_trainset_ids + k, dataset_norm_mptr, mc_trainset_norm); } @@ -917,7 +915,7 @@ auto build_fine_clusters(const handle_t& handle, n_iters, dim, mc_trainset, - mesocluster_sizes[i], + k, fine_clusters_nums[i], mc_trainset_ccenters.data(), mc_trainset_labels.data(), @@ -1036,10 +1034,19 @@ void build_hierarchical(const handle_t& handle, auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes); - if (mesocluster_size_max * n_mesoclusters > 2 * n_rows) { - RAFT_LOG_WARN("build_hierarchical: built unbalanced mesoclusters"); + const auto mesocluster_size_max_balanced = uint32_t(div_rounding_up_safe( + 2lu * size_t(n_rows), std::max(size_t(n_mesoclusters), 1lu))); + if (mesocluster_size_max > mesocluster_size_max_balanced) { + RAFT_LOG_WARN( + "build_hierarchical: built unbalanced mesoclusters (max_mesocluster_size == %u > %u). " + "At most %u points will be used for training within each mesocluster. " + "Consider increasing the number of training iterations `n_iters`.", + mesocluster_size_max, + mesocluster_size_max_balanced, + mesocluster_size_max_balanced); RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); + mesocluster_size_max = mesocluster_size_max_balanced; } auto n_clusters_done = build_fine_clusters(handle, From a9e1adc6a55f03fb98199ab8c7f4bc82e9849a73 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Sat, 21 Jan 2023 21:40:14 +0100 Subject: [PATCH 26/44] Improvement of the math API wrappers (#1146) Solves #1025 Provides a centralized collection of host- and device-friendly wrappers around common math operations, with generalizations when useful. Deprecates former `myXxx` wrappers. Those wrappers are mostly intended to future-proof the API as well as simplify the definition of host-device functions. Authors: - Louis Sugy (https://github.com/Nyrio) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1146 --- cpp/include/raft/core/math.hpp | 320 ++++++++++++++++ cpp/include/raft/core/operators.hpp | 27 +- cpp/include/raft/distance/detail/canberra.cuh | 4 +- .../raft/distance/detail/chebyshev.cuh | 4 +- .../raft/distance/detail/correlation.cuh | 4 +- .../raft/distance/detail/euclidean.cuh | 8 +- .../raft/distance/detail/fused_l2_nn.cuh | 4 +- .../raft/distance/detail/hellinger.cuh | 4 +- .../raft/distance/detail/jensen_shannon.cuh | 8 +- .../raft/distance/detail/kl_divergence.cuh | 14 +- cpp/include/raft/distance/detail/l1.cuh | 2 +- .../raft/distance/detail/minkowski.cuh | 8 +- cpp/include/raft/linalg/detail/lstsq.cuh | 4 +- cpp/include/raft/matrix/detail/math.cuh | 10 +- .../raft/random/detail/make_regression.cuh | 6 +- cpp/include/raft/random/detail/rng_device.cuh | 24 +- .../sparse/distance/detail/l2_distance.cuh | 12 +- .../sparse/distance/detail/lp_distance.cuh | 6 +- .../spatial/knn/detail/ball_cover/common.cuh | 4 +- .../spatial/knn/detail/haversine_distance.cuh | 8 +- .../raft/spectral/detail/spectral_util.cuh | 4 +- cpp/include/raft/stats/detail/stddev.cuh | 6 +- cpp/include/raft/util/cuda_utils.cuh | 90 ++--- cpp/test/CMakeLists.txt | 2 + cpp/test/core/math_device.cu | 352 ++++++++++++++++++ cpp/test/core/math_host.cpp | 195 ++++++++++ cpp/test/distance/distance_base.cuh | 22 +- cpp/test/distance/fused_l2_nn.cu | 2 +- cpp/test/linalg/matrix_vector.cu | 4 +- cpp/test/linalg/norm.cu | 10 +- cpp/test/linalg/power.cu | 6 +- cpp/test/linalg/sqrt.cu | 4 +- cpp/test/matrix/math.cu | 4 +- cpp/test/neighbors/ann_utils.cuh | 2 +- cpp/test/random/rng.cu | 14 +- 35 files changed, 1034 insertions(+), 164 deletions(-) create mode 100644 cpp/include/raft/core/math.hpp create mode 100644 cpp/test/core/math_device.cu create mode 100644 cpp/test/core/math_host.cpp diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp new file mode 100644 index 0000000000..c5f08b84b7 --- /dev/null +++ b/cpp/include/raft/core/math.hpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace raft { + +/** + * @defgroup Absolute Absolute value + * @{ + */ +template +RAFT_INLINE_FUNCTION auto abs(T x) + -> std::enable_if_t || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v, + T> +{ +#ifdef __CUDA_ARCH__ + return ::abs(x); +#else + return std::abs(x); +#endif +} +template +constexpr RAFT_INLINE_FUNCTION auto abs(T x) + -> std::enable_if_t && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + !std::is_same_v, + T> +{ + return x < T{0} ? -x : x; +} +/** @} */ + +/** + * @defgroup Trigonometry Trigonometry functions + * @{ + */ +/** Inverse cosine */ +template +RAFT_INLINE_FUNCTION auto acos(T x) +{ +#ifdef __CUDA_ARCH__ + return ::acos(x); +#else + return std::acos(x); +#endif +} + +/** Inverse sine */ +template +RAFT_INLINE_FUNCTION auto asin(T x) +{ +#ifdef __CUDA_ARCH__ + return ::asin(x); +#else + return std::asin(x); +#endif +} + +/** Inverse hyperbolic tangent */ +template +RAFT_INLINE_FUNCTION auto atanh(T x) +{ +#ifdef __CUDA_ARCH__ + return ::atanh(x); +#else + return std::atanh(x); +#endif +} + +/** Cosine */ +template +RAFT_INLINE_FUNCTION auto cos(T x) +{ +#ifdef __CUDA_ARCH__ + return ::cos(x); +#else + return std::cos(x); +#endif +} + +/** Sine */ +template +RAFT_INLINE_FUNCTION auto sin(T x) +{ +#ifdef __CUDA_ARCH__ + return ::sin(x); +#else + return std::sin(x); +#endif +} + +/** Sine and cosine */ +template +RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( + const T& x, T* s, T* c) +{ +#ifdef __CUDA_ARCH__ + ::sincos(x, s, c); +#else + *s = std::sin(x); + *c = std::cos(x); +#endif +} + +/** Hyperbolic tangent */ +template +RAFT_INLINE_FUNCTION auto tanh(T x) +{ +#ifdef __CUDA_ARCH__ + return ::tanh(x); +#else + return std::tanh(x); +#endif +} +/** @} */ + +/** + * @defgroup Exponential Exponential and logarithm + * @{ + */ +/** Exponential function */ +template +RAFT_INLINE_FUNCTION auto exp(T x) +{ +#ifdef __CUDA_ARCH__ + return ::exp(x); +#else + return std::exp(x); +#endif +} + +/** Natural logarithm */ +template +RAFT_INLINE_FUNCTION auto log(T x) +{ +#ifdef __CUDA_ARCH__ + return ::log(x); +#else + return std::log(x); +#endif +} +/** @} */ + +/** + * @defgroup Maximum Maximum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::max, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the max of + * -1 and 1u is 4294967295u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). + * @{ + */ +template +RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) +{ +#ifdef __CUDA_ARCH__ + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + ((std::is_same_v || std::is_same_v)&&( + std::is_same_v || std::is_same_v))) { + return ::max(x, y); + } + // Else, check that the types are the same and provide a generic implementation + else { + static_assert( + std::is_same_v, + "No native max overload for these types. Both argument types must be the same to use " + "the generic max. Please cast appropriately."); + return (x < y) ? y : x; + } +#else + if constexpr (std::is_same_v && std::is_same_v) { + return std::max(static_cast(x), y); + } else if constexpr (std::is_same_v && std::is_same_v) { + return std::max(x, static_cast(y)); + } else { + static_assert( + std::is_same_v, + "std::max requires that both argument types be the same. Please cast appropriately."); + return std::max(x, y); + } +#endif +} + +/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ +template +RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args) +{ + return raft::max(x, raft::max(y, std::forward(args)...)); +} + +/** One-argument overload for convenience when using with variadic arguments */ +template +constexpr RAFT_INLINE_FUNCTION auto max(const T& x) +{ + return x; +} +/** @} */ + +/** + * @defgroup Minimum Minimum of two or more values. + * + * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * functionality while wrapping around std::min, which only supports arguments of the same type. + * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is + * very error-prone so we do not support that and require the user to cast instead. (e.g the min of + * -1 and 1u is 1u...) + * + * When no overload matches, we provide a generic implementation but require that both types be the + * same (and that the less-than operator be defined). + * @{ + */ +template +RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) +{ +#ifdef __CUDA_ARCH__ + // Combinations of types supported by the CUDA Math API + if constexpr ((std::is_integral_v && std::is_integral_v && std::is_same_v) || + ((std::is_same_v || std::is_same_v)&&( + std::is_same_v || std::is_same_v))) { + return ::min(x, y); + } + // Else, check that the types are the same and provide a generic implementation + else { + static_assert( + std::is_same_v, + "No native min overload for these types. Both argument types must be the same to use " + "the generic min. Please cast appropriately."); + return (y < x) ? y : x; + } +#else + if constexpr (std::is_same_v && std::is_same_v) { + return std::min(static_cast(x), y); + } else if constexpr (std::is_same_v && std::is_same_v) { + return std::min(x, static_cast(y)); + } else { + static_assert( + std::is_same_v, + "std::min requires that both argument types be the same. Please cast appropriately."); + return std::min(x, y); + } +#endif +} + +/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ +template +RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args) +{ + return raft::min(x, raft::min(y, std::forward(args)...)); +} + +/** One-argument overload for convenience when using with variadic arguments */ +template +constexpr RAFT_INLINE_FUNCTION auto min(const T& x) +{ + return x; +} +/** @} */ + +/** + * @defgroup Power Power and root functions + * @{ + */ +/** Power */ +template +RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) +{ +#ifdef __CUDA_ARCH__ + return ::pow(x, y); +#else + return std::pow(x, y); +#endif +} + +/** Square root */ +template +RAFT_INLINE_FUNCTION auto sqrt(T x) +{ +#ifdef __CUDA_ARCH__ + return ::sqrt(x); +#else + return std::sqrt(x); +#endif +} +/** @} */ + +/** Sign */ +template +RAFT_INLINE_FUNCTION auto sgn(T val) -> int +{ + return (T(0) < val) - (val < T(0)); +} + +} // namespace raft diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp index 398354df46..de27c2b271 100644 --- a/cpp/include/raft/core/operators.hpp +++ b/cpp/include/raft/core/operators.hpp @@ -23,6 +23,7 @@ #include #include +#include namespace raft { @@ -75,9 +76,9 @@ struct value_op { struct sqrt_op { template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const + RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const { - return std::sqrt(in); + return raft::sqrt(in); } }; @@ -91,9 +92,9 @@ struct nz_op { struct abs_op { template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const + RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const { - return std::abs(in); + return raft::abs(in); } }; @@ -148,27 +149,25 @@ struct div_checkzero_op { struct pow_op { template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const + RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const { - return std::pow(a, b); + return raft::pow(a, b); } }; struct min_op { - template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const + template + RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const { - if (a > b) { return b; } - return a; + return raft::min(std::forward(args)...); } }; struct max_op { - template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const + template + RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const { - if (b > a) { return b; } - return a; + return raft::max(std::forward(args)...); } }; diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index 43a904edba..f17a26dc4b 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -73,8 +73,8 @@ static void canberraImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto diff = raft::abs(x - y); + const auto add = raft::abs(x) + raft::abs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh index 52573bd170..43b36e7921 100644 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/chebyshev.cuh @@ -73,8 +73,8 @@ static void chebyshevImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - acc = raft::myMax(acc, diff); + const auto diff = raft::abs(x - y); + acc = raft::max(acc, diff); }; // epilogue operation lambda for final value calculation diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index 9bdbbf112c..f7fe3678e6 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -125,7 +125,7 @@ static void correlationImpl(const DataT* x, auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); - acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); } } }; diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 4184810fff..1a2db63f5c 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ struct L2ExpandedOp { __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept { AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - return sqrt ? raft::mySqrt(outVal) : outVal; + return sqrt ? raft::sqrt(outVal) : outVal; } __device__ AccT operator()(DataT aData) const noexcept { return aData; } @@ -130,7 +130,7 @@ void euclideanExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + acc[i][j] = raft::sqrt(acc[i][j]); } } } @@ -350,7 +350,7 @@ void euclideanUnExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + acc[i][j] = raft::sqrt(acc[i][j]); } } } diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index c9750df8ad..447359ffe6 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -175,7 +175,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto acc_ij = acc[i][j]; - acc[i][j] = acc_ij > DataT{0} ? raft::mySqrt(acc_ij) : DataT{0}; + acc[i][j] = acc_ij > DataT{0} ? raft::sqrt(acc_ij) : DataT{0}; } } } diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 51f462ab36..13507fe84f 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ static void hellingerImpl(const DataT* x, // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::sqrt(rectifier * finalVal); } } }; diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh index 92ee071cf5..f96da01b87 100644 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,11 +78,11 @@ static void jensenShannonImpl(const DataT* x, auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const DataT m = 0.5f * (x + y); const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::myLog(m + m_zero); + const auto logM = (!m_zero) * raft::log(m + m_zero); const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero))); + acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); }; // epilogue operation lambda for final value calculation @@ -95,7 +95,7 @@ static void jensenShannonImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(0.5 * acc[i][j]); + acc[i][j] = raft::sqrt(0.5 * acc[i][j]); } } }; diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index 4c0c4b6ace..7ebeaf4de9 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,10 +81,10 @@ static void klDivergenceImpl(const DataT* x, auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { if (isRowMajor) { const bool x_zero = (x == 0); - acc += x * (raft::myLog(x + x_zero) - y); + acc += x * (raft::log(x + x_zero) - y); } else { const bool y_zero = (y == 0); - acc += y * (raft::myLog(y + y_zero) - x); + acc += y * (raft::log(y + y_zero) - x); } }; @@ -92,23 +92,23 @@ static void klDivergenceImpl(const DataT* x, if (isRowMajor) { const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); } else { const bool y_zero = (y == 0); const bool x_zero = (x == 0); - acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); } }; auto unaryOp_lambda = [] __device__(DataT input) { const bool x_zero = (input == 0); - return (!x_zero) * raft::myLog(input + x_zero); + return (!x_zero) * raft::log(input + x_zero); }; auto unaryOp_lambda_reverse = [] __device__(DataT input) { // reverse previous log (x) back to x using (e ^ log(x)) const bool x_zero = (input == 0); - return (!x_zero) * raft::myExp(input); + return (!x_zero) * raft::exp(input); }; // epilogue operation lambda for final value calculation diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 87893bab7c..bf10651b60 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -71,7 +71,7 @@ static void l1Impl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); + const auto diff = raft::abs(x - y); acc += diff; }; diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh index bda83babf1..42af8cd281 100644 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ b/cpp/include/raft/distance/detail/minkowski.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,8 +74,8 @@ void minkowskiUnExpImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - acc += raft::myPow(diff, p); + const auto diff = raft::abs(x - y); + acc += raft::pow(diff, p); }; // epilogue operation lambda for final value calculation @@ -89,7 +89,7 @@ void minkowskiUnExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::myPow(acc[i][j], one_over_p); + acc[i][j] = raft::pow(acc[i][j], one_over_p); } } }; diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh index 1273956b21..f0cf300e2f 100644 --- a/cpp/include/raft/linalg/detail/lstsq.cuh +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,7 +104,7 @@ struct DivideByNonZero { operator()(const math_t a, const math_t b) const { - return raft::myAbs(b) >= eps ? a / b : a; + return raft::abs(b) >= eps ? a / b : a; } }; diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index c559da3942..f5c33d1cf6 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,10 +87,10 @@ void seqRoot(math_t* in, if (a < math_t(0)) { return math_t(0); } else { - return sqrt(a * scalar); + return raft::sqrt(a * scalar); } } else { - return sqrt(a * scalar); + return raft::sqrt(a * scalar); } }, stream); @@ -278,7 +278,7 @@ void matrixVectorBinaryDivSkipZero(Type* data, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) + if (raft::abs(b) < Type(1e-10)) return Type(0); else return a / b; @@ -294,7 +294,7 @@ void matrixVectorBinaryDivSkipZero(Type* data, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) + if (raft::abs(b) < Type(1e-10)) return a; else return a / b; diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh index cb0949c458..057196cd74 100644 --- a/cpp/include/raft/random/detail/make_regression.cuh +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,8 +44,8 @@ static __global__ void _singular_profile_kernel(DataT* out, IdxT n, DataT tail_s IdxT tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { DataT sval = static_cast(tid) / rank; - DataT low_rank = ((DataT)1.0 - tail_strength) * raft::myExp(-sval * sval); - DataT tail = tail_strength * raft::myExp((DataT)-0.1 * sval); + DataT low_rank = ((DataT)1.0 - tail_strength) * raft::exp(-sval * sval); + DataT tail = tail_strength * raft::exp((DataT)-0.1 * sval); out[tid] = low_rank + tail; } } diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh index 6c75a4fa78..7f994fb07f 100644 --- a/cpp/include/raft/random/detail/rng_device.cuh +++ b/cpp/include/raft/random/detail/rng_device.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,10 +143,10 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type { constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type R = raft::sqrt(minus2 * raft::log(val1)); Type theta = twoPi * val2; Type s, c; - raft::mySinCos(theta, s, c); + raft::sincos(theta, &s, &c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } @@ -323,7 +323,7 @@ DI void custom_next( gen.next(res); } while (res == OutType(0.0)); - *val = params.mu - params.beta * raft::myLog(-raft::myLog(res)); + *val = params.mu - params.beta * raft::log(-raft::log(res)); } template @@ -340,8 +340,8 @@ DI void custom_next(GenType& gen, gen.next(res2); box_muller_transform(res1, res2, params.sigma, params.mu); - *val = raft::myExp(res1); - *(val + 1) = raft::myExp(res2); + *val = raft::exp(res1); + *(val + 1) = raft::exp(res2); } template @@ -358,7 +358,7 @@ DI void custom_next(GenType& gen, } while (res == OutType(0.0)); constexpr OutType one = (OutType)1.0; - *val = params.mu - params.scale * raft::myLog(one / res - one); + *val = params.mu - params.scale * raft::log(one / res - one); } template @@ -371,7 +371,7 @@ DI void custom_next(GenType& gen, OutType res; gen.next(res); constexpr OutType one = (OutType)1.0; - *val = -raft::myLog(one - res) / params.lambda; + *val = -raft::log(one - res) / params.lambda; } template @@ -386,7 +386,7 @@ DI void custom_next(GenType& gen, constexpr OutType one = (OutType)1.0; constexpr OutType two = (OutType)2.0; - *val = raft::mySqrt(-two * raft::myLog(one - res)) * params.sigma; + *val = raft::sqrt(-two * raft::log(one - res)) * params.sigma; } template @@ -409,9 +409,9 @@ DI void custom_next(GenType& gen, // The <= comparison here means, number of samples going in `if` branch are more by 1 than `else` // branch. However it does not matter as for 0.5 both branches evaluate to same result. if (res <= oneHalf) { - out = params.mu + params.scale * raft::myLog(two * res); + out = params.mu + params.scale * raft::log(two * res); } else { - out = params.mu - params.scale * raft::myLog(two * (one - res)); + out = params.mu - params.scale * raft::log(two * (one - res)); } *val = out; } @@ -424,7 +424,7 @@ DI void custom_next( gen.next(res); params.inIdxPtr[idx] = idx; constexpr OutType one = (OutType)1.0; - auto exp = -raft::myLog(one - res); + auto exp = -raft::log(one - res); if (params.wts != nullptr) { *val = exp / params.wts[idx]; } else { diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 3c852235df..2f165b3ff2 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,7 +112,7 @@ __global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); - value_t val = 1 - (numer / sqrt(Q_denom * R_denom)); + value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom)); // correct for small instabilities C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); @@ -292,7 +292,7 @@ class l2_sqrt_expanded_distances_t : public l2_expanded_distances_tconfig_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; - return sqrt(abs(input) * neg); + return raft::sqrt(abs(input) * neg); }, this->config_->handle.get_stream()); } @@ -379,7 +379,7 @@ class cosine_expanded_distances_t : public distances_t { config_->b_nrows, config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); + value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); @@ -429,7 +429,7 @@ class hellinger_expanded_distances_t : public distances_t { out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, + [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); }, raft::add_op(), raft::atomic_add_op()); @@ -440,7 +440,7 @@ class hellinger_expanded_distances_t : public distances_t { [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; - return sqrt(rectifier * (1 - input)); + return raft::sqrt(rectifier * (1 - input)); }, config_->handle.get_stream()); } diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index a973aebbab..f67109afbc 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,7 +132,7 @@ class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_tconfig_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; - return sqrt(abs(input) * neg); + return raft::sqrt(abs(input) * neg); }, this->config_->handle.get_stream()); } @@ -274,7 +274,7 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { out_dists, out_dists, config_->a_nrows * config_->b_nrows, - [=] __device__(value_t input) { return sqrt(0.5 * input); }, + [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, config_->handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh index b09cf0da10..0a6718f5a5 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,7 +71,7 @@ struct EuclideanFunc : public DistFunc { sum_sq += diff * diff; } - return sqrt(sum_sq); + return raft::sqrt(sum_sq); } }; diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index e073841dd3..9cecc0adf4 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -32,11 +32,11 @@ namespace detail { template DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { - value_t sin_0 = sin(0.5 * (x1 - y1)); - value_t sin_1 = sin(0.5 * (x2 - y2)); - value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; + value_t sin_0 = raft::sin(0.5 * (x1 - y1)); + value_t sin_1 = raft::sin(0.5 * (x2 - y2)); + value_t rdist = sin_0 * sin_0 + raft::cos(x1) * raft::cos(y1) * sin_1 * sin_1; - return 2 * asin(sqrt(rdist)); + return 2 * raft::asin(raft::sqrt(rdist)); } /** diff --git a/cpp/include/raft/spectral/detail/spectral_util.cuh b/cpp/include/raft/spectral/detail/spectral_util.cuh index 3a0ad1f96f..5991e71ec6 100644 --- a/cpp/include/raft/spectral/detail/spectral_util.cuh +++ b/cpp/include/raft/spectral/detail/spectral_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,7 +72,7 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_ty // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); + alpha = raft::sqrt(alpha); for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 index = i + j * m; diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index ccea2ea5da..2f7e22ca8a 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,7 +63,7 @@ __global__ void stddevKernelColMajor( thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } + if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } } template @@ -126,7 +126,7 @@ void stddev(Type* std, std, mu, D, - [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, + [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, stream); } else { stddevKernelColMajor<<>>(std, data, mu, D, N); diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 61dd6e0ad8..5be9dc999a 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #ifndef ENABLE_MEMCPY_ASYNC @@ -259,12 +260,14 @@ DI double myAtomicMax(double* address, double val) template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) +[[deprecated("use raft::max from raft/core/math.hpp instead")]] HDI float myMax(float x, + float y) { return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) +[[deprecated("use raft::max from raft/core/math.hpp instead")]] HDI double myMax(double x, + double y) { return fmax(x, y); } @@ -277,12 +280,14 @@ HDI double myMax(double x, double y) template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) +[[deprecated("use raft::min from raft/core/math.hpp instead")]] HDI float myMin(float x, + float y) { return fminf(x, y); } template <> -HDI double myMin(double x, double y) +[[deprecated("use raft::min from raft/core/math.hpp instead")]] HDI double myMin(double x, + double y) { return fmin(x, y); } @@ -298,7 +303,7 @@ HDI double myMin(double x, double y) template DI T myAtomicMin(T* address, T val) { - myAtomicReduce(address, val, myMin); + myAtomicReduce(address, val, raft::min_op{}); return *address; } @@ -312,19 +317,10 @@ DI T myAtomicMin(T* address, T val) template DI T myAtomicMax(T* address, T val) { - myAtomicReduce(address, val, myMax); + myAtomicReduce(address, val, raft::max_op{}); return *address; } -/** - * Sign function - */ -template -HDI int sgn(const T val) -{ - return (T(0) < val) - (val < T(0)); -} - /** * @defgroup Exp Exponential function * @{ @@ -332,14 +328,14 @@ HDI int sgn(const T val) template HDI T myExp(T x); template <> -HDI float myExp(float x) +[[deprecated("use raft::exp from raft/core/math.hpp instead")]] HDI float myExp(float x) { return expf(x); } template <> -HDI double myExp(double x) +[[deprecated("use raft::exp from raft/core/math.hpp instead")]] HDI double myExp(double x) { - return exp(x); + return ::exp(x); } /** @} */ @@ -368,14 +364,14 @@ inline __device__ double myInf() template HDI T myLog(T x); template <> -HDI float myLog(float x) +[[deprecated("use raft::log from raft/core/math.hpp instead")]] HDI float myLog(float x) { return logf(x); } template <> -HDI double myLog(double x) +[[deprecated("use raft::log from raft/core/math.hpp instead")]] HDI double myLog(double x) { - return log(x); + return ::log(x); } /** @} */ @@ -386,14 +382,14 @@ HDI double myLog(double x) template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) +[[deprecated("use raft::sqrt from raft/core/math.hpp instead")]] HDI float mySqrt(float x) { return sqrtf(x); } template <> -HDI double mySqrt(double x) +[[deprecated("use raft::sqrt from raft/core/math.hpp instead")]] HDI double mySqrt(double x) { - return sqrt(x); + return ::sqrt(x); } /** @} */ @@ -404,14 +400,18 @@ HDI double mySqrt(double x) template DI void mySinCos(T x, T& s, T& c); template <> -DI void mySinCos(float x, float& s, float& c) +[[deprecated("use raft::sincos from raft/core/math.hpp instead")]] DI void mySinCos(float x, + float& s, + float& c) { sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double& s, double& c) +[[deprecated("use raft::sincos from raft/core/math.hpp instead")]] DI void mySinCos(double x, + double& s, + double& c) { - sincos(x, &s, &c); + ::sincos(x, &s, &c); } /** @} */ @@ -422,14 +422,14 @@ DI void mySinCos(double x, double& s, double& c) template DI T mySin(T x); template <> -DI float mySin(float x) +[[deprecated("use raft::sin from raft/core/math.hpp instead")]] DI float mySin(float x) { return sinf(x); } template <> -DI double mySin(double x) +[[deprecated("use raft::sin from raft/core/math.hpp instead")]] DI double mySin(double x) { - return sin(x); + return ::sin(x); } /** @} */ @@ -443,12 +443,12 @@ DI T myAbs(T x) return x < 0 ? -x : x; } template <> -DI float myAbs(float x) +[[deprecated("use raft::abs from raft/core/math.hpp instead")]] DI float myAbs(float x) { return fabsf(x); } template <> -DI double myAbs(double x) +[[deprecated("use raft::abs from raft/core/math.hpp instead")]] DI double myAbs(double x) { return fabs(x); } @@ -461,14 +461,16 @@ DI double myAbs(double x) template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) +[[deprecated("use raft::pow from raft/core/math.hpp instead")]] HDI float myPow(float x, + float power) { return powf(x, power); } template <> -HDI double myPow(double x, double power) +[[deprecated("use raft::pow from raft/core/math.hpp instead")]] HDI double myPow(double x, + double power) { - return pow(x, power); + return ::pow(x, power); } /** @} */ @@ -479,14 +481,14 @@ HDI double myPow(double x, double power) template HDI T myTanh(T x); template <> -HDI float myTanh(float x) +[[deprecated("use raft::tanh from raft/core/math.hpp instead")]] HDI float myTanh(float x) { return tanhf(x); } template <> -HDI double myTanh(double x) +[[deprecated("use raft::tanh from raft/core/math.hpp instead")]] HDI double myTanh(double x) { - return tanh(x); + return ::tanh(x); } /** @} */ @@ -497,14 +499,14 @@ HDI double myTanh(double x) template HDI T myATanh(T x); template <> -HDI float myATanh(float x) +[[deprecated("use raft::atanh from raft/core/math.hpp instead")]] HDI float myATanh(float x) { return atanhf(x); } template <> -HDI double myATanh(double x) +[[deprecated("use raft::atanh from raft/core/math.hpp instead")]] HDI double myATanh(double x) { - return atanh(x); + return ::atanh(x); } /** @} */ @@ -526,7 +528,7 @@ struct SqrtOp { [[deprecated("SqrtOp is deprecated. Use sqrt_op instead.")]] HDI Type operator()(Type in, IdxType i = 0) const { - return mySqrt(in); + return raft::sqrt(in); } }; @@ -544,7 +546,7 @@ struct L1Op { [[deprecated("L1Op is deprecated. Use abs_op instead.")]] HDI Type operator()(Type in, IdxType i = 0) const { - return myAbs(in); + return raft::abs(in); } }; diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 8ca30a5c82..a4b3758faa 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -86,6 +86,8 @@ if(BUILD_TESTS) CORE_TEST PATH test/core/logger.cpp + test/core/math_device.cu + test/core/math_host.cpp test/core/operators_device.cu test/core/operators_host.cpp test/core/handle.cpp diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu new file mode 100644 index 0000000000..ff4b343d9e --- /dev/null +++ b/cpp/test/core/math_device.cu @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../test_utils.h" +#include +#include +#include + +template +__global__ void math_eval_kernel(OutT* out, OpT op, Args... args) +{ + out[0] = op(std::forward(args)...); +} + +template +auto math_eval(OpT op, Args&&... args) +{ + typedef decltype(op(args...)) OutT; + auto stream = rmm::cuda_stream_default; + rmm::device_scalar result(stream); + math_eval_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward(args)...); + return result.value(stream); +} + +struct abs_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::abs(in); + } +}; + +TEST(MathDevice, Abs) +{ + // Integer abs + ASSERT_TRUE( + raft::match(int8_t{123}, math_eval(abs_test_op{}, int8_t{-123}), raft::Compare())); + ASSERT_TRUE(raft::match(12345, math_eval(abs_test_op{}, -12345), raft::Compare())); + ASSERT_TRUE(raft::match(12345l, math_eval(abs_test_op{}, -12345l), raft::Compare())); + ASSERT_TRUE(raft::match(123451234512345ll, + math_eval(abs_test_op{}, -123451234512345ll), + raft::Compare())); + // Floating-point abs + ASSERT_TRUE( + raft::match(12.34f, math_eval(abs_test_op{}, -12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(12.34, math_eval(abs_test_op{}, -12.34), raft::CompareApprox(0.000001))); +} + +struct acos_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::acos(in); + } +}; + +TEST(MathDevice, Acos) +{ + ASSERT_TRUE(raft::match( + std::acos(0.123f), math_eval(acos_test_op{}, 0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::acos(0.123), math_eval(acos_test_op{}, 0.123), raft::CompareApprox(0.000001))); +} + +struct asin_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::asin(in); + } +}; + +TEST(MathDevice, Asin) +{ + ASSERT_TRUE(raft::match( + std::asin(0.123f), math_eval(asin_test_op{}, 0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::asin(0.123), math_eval(asin_test_op{}, 0.123), raft::CompareApprox(0.000001))); +} + +struct atanh_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::atanh(in); + } +}; + +TEST(MathDevice, Atanh) +{ + ASSERT_TRUE(raft::match( + std::atanh(0.123f), math_eval(atanh_test_op{}, 0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::atanh(0.123), math_eval(atanh_test_op{}, 0.123), raft::CompareApprox(0.000001))); +} + +struct cos_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::cos(in); + } +}; + +TEST(MathDevice, Cos) +{ + ASSERT_TRUE(raft::match( + std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::cos(12.34), math_eval(cos_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} + +struct exp_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::exp(in); + } +}; + +TEST(MathDevice, Exp) +{ + ASSERT_TRUE(raft::match( + std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} + +struct log_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::log(in); + } +}; + +TEST(MathDevice, Log) +{ + ASSERT_TRUE(raft::match( + std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::log(12.34), math_eval(log_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} + +struct max_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const + { + return raft::max(std::forward(args)...); + } +}; + +TEST(MathDevice, Max2) +{ + ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, -1234, 1234), raft::Compare())); + ASSERT_TRUE( + raft::match(1234u, math_eval(max_test_op{}, 1234u, 123u), raft::Compare())); + ASSERT_TRUE( + raft::match(1234ll, math_eval(max_test_op{}, -1234ll, 1234ll), raft::Compare())); + ASSERT_TRUE(raft::match( + 1234ull, math_eval(max_test_op{}, 1234ull, 123ull), raft::Compare())); + + ASSERT_TRUE( + raft::match(12.34f, math_eval(max_test_op{}, -12.34f, 12.34f), raft::Compare())); + ASSERT_TRUE(raft::match(12.34, math_eval(max_test_op{}, -12.34, 12.34), raft::Compare())); + ASSERT_TRUE(raft::match( + 12.34, math_eval(max_test_op{}, -12.34f, 12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + 12.34, math_eval(max_test_op{}, -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathDevice, Max3) +{ + ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, 1234, 0, -1234), raft::Compare())); + ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, -1234, 1234, 0), raft::Compare())); + ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, 0, -1234, 1234), raft::Compare())); + + ASSERT_TRUE(raft::match( + 12.34, math_eval(max_test_op{}, 12.34f, 0., -12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + 12.34, math_eval(max_test_op{}, -12.34, 12.34f, 0.), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + 12.34, math_eval(max_test_op{}, 0., -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +struct min_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const + { + return raft::min(std::forward(args)...); + } +}; + +TEST(MathDevice, Min2) +{ + ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, -1234, 1234), raft::Compare())); + ASSERT_TRUE( + raft::match(123u, math_eval(min_test_op{}, 1234u, 123u), raft::Compare())); + ASSERT_TRUE(raft::match( + -1234ll, math_eval(min_test_op{}, -1234ll, 1234ll), raft::Compare())); + ASSERT_TRUE(raft::match( + 123ull, math_eval(min_test_op{}, 1234ull, 123ull), raft::Compare())); + + ASSERT_TRUE( + raft::match(-12.34f, math_eval(min_test_op{}, -12.34f, 12.34f), raft::Compare())); + ASSERT_TRUE( + raft::match(-12.34, math_eval(min_test_op{}, -12.34, 12.34), raft::Compare())); + ASSERT_TRUE(raft::match( + -12.34, math_eval(min_test_op{}, -12.34f, 12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + -12.34, math_eval(min_test_op{}, -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathDevice, Min3) +{ + ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, 1234, 0, -1234), raft::Compare())); + ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, -1234, 1234, 0), raft::Compare())); + ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, 0, -1234, 1234), raft::Compare())); + + ASSERT_TRUE(raft::match( + -12.34, math_eval(min_test_op{}, 12.34f, 0., -12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + -12.34, math_eval(min_test_op{}, -12.34, 12.34f, 0.), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match( + -12.34, math_eval(min_test_op{}, 0., -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +struct pow_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& x, const Type& y) const + { + return raft::pow(x, y); + } +}; + +TEST(MathDevice, Pow) +{ + ASSERT_TRUE(raft::match(std::pow(12.34f, 2.f), + math_eval(pow_test_op{}, 12.34f, 2.f), + raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match(std::pow(12.34, 2.), + math_eval(pow_test_op{}, 12.34, 2.), + raft::CompareApprox(0.000001))); +} + +struct sgn_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::sgn(in); + } +}; + +TEST(MathDevice, Sgn) +{ + ASSERT_TRUE(raft::match(-1, math_eval(sgn_test_op{}, -1234), raft::Compare())); + ASSERT_TRUE(raft::match(0, math_eval(sgn_test_op{}, 0), raft::Compare())); + ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 1234), raft::Compare())); + ASSERT_TRUE(raft::match(-1, math_eval(sgn_test_op{}, -12.34f), raft::Compare())); + ASSERT_TRUE(raft::match(0, math_eval(sgn_test_op{}, 0.f), raft::Compare())); + ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare())); +} + +struct sin_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::sin(in); + } +}; + +TEST(MathDevice, Sin) +{ + ASSERT_TRUE(raft::match( + std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::sin(12.34), math_eval(sin_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} + +struct sincos_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& x, Type* s, Type* c) const + { + raft::sincos(x, s, c); + return x; // unused, just to avoid creating another helper + } +}; + +TEST(MathDevice, SinCos) +{ + auto stream = rmm::cuda_stream_default; + float xf = 12.34f; + rmm::device_scalar sf(stream); + rmm::device_scalar cf(stream); + math_eval(sincos_test_op{}, xf, sf.data(), cf.data()); + ASSERT_TRUE(raft::match(std::sin(12.34f), sf.value(stream), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match(std::cos(12.34f), cf.value(stream), raft::CompareApprox(0.0001f))); + double xd = 12.34f; + rmm::device_scalar sd(stream); + rmm::device_scalar cd(stream); + math_eval(sincos_test_op{}, xd, sd.data(), cd.data()); + ASSERT_TRUE(raft::match(std::sin(12.34), sd.value(stream), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox(0.0001f))); +} + +struct sqrt_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::sqrt(in); + } +}; + +TEST(MathDevice, Sqrt) +{ + ASSERT_TRUE(raft::match( + std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::sqrt(12.34), math_eval(sqrt_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} + +struct tanh_test_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const + { + return raft::tanh(in); + } +}; + +TEST(MathDevice, Tanh) +{ + ASSERT_TRUE(raft::match( + std::tanh(12.34f), math_eval(tanh_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match( + std::tanh(12.34), math_eval(tanh_test_op{}, 12.34), raft::CompareApprox(0.000001))); +} diff --git a/cpp/test/core/math_host.cpp b/cpp/test/core/math_host.cpp new file mode 100644 index 0000000000..5808905713 --- /dev/null +++ b/cpp/test/core/math_host.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../test_utils.h" +#include + +TEST(MathHost, Abs) +{ + // Integer abs + ASSERT_TRUE(raft::match(int8_t{123}, raft::abs(int8_t{-123}), raft::Compare())); + ASSERT_TRUE(raft::match(12345, raft::abs(-12345), raft::Compare())); + ASSERT_TRUE(raft::match(12345l, raft::abs(-12345l), raft::Compare())); + ASSERT_TRUE( + raft::match(123451234512345ll, raft::abs(-123451234512345ll), raft::Compare())); + // Floating-point abs + ASSERT_TRUE(raft::match(12.34f, raft::abs(-12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match(12.34, raft::abs(-12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Acos) +{ + ASSERT_TRUE( + raft::match(std::acos(0.123f), raft::acos(0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::acos(0.123), raft::acos(0.123), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Asin) +{ + ASSERT_TRUE( + raft::match(std::asin(0.123f), raft::asin(0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::asin(0.123), raft::asin(0.123), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Atanh) +{ + ASSERT_TRUE( + raft::match(std::atanh(0.123f), raft::atanh(0.123f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::atanh(0.123), raft::atanh(0.123), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Cos) +{ + ASSERT_TRUE( + raft::match(std::cos(12.34f), raft::cos(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::cos(12.34), raft::cos(12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Exp) +{ + ASSERT_TRUE( + raft::match(std::exp(12.34f), raft::exp(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::exp(12.34), raft::exp(12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Log) +{ + ASSERT_TRUE( + raft::match(std::log(12.34f), raft::log(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::log(12.34), raft::log(12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Max2) +{ + ASSERT_TRUE(raft::match(1234, raft::max(-1234, 1234), raft::Compare())); + ASSERT_TRUE(raft::match(1234u, raft::max(1234u, 123u), raft::Compare())); + ASSERT_TRUE(raft::match(1234ll, raft::max(-1234ll, 1234ll), raft::Compare())); + ASSERT_TRUE( + raft::match(1234ull, raft::max(1234ull, 123ull), raft::Compare())); + + ASSERT_TRUE(raft::match(12.34f, raft::max(-12.34f, 12.34f), raft::Compare())); + ASSERT_TRUE(raft::match(12.34, raft::max(-12.34, 12.34), raft::Compare())); + ASSERT_TRUE(raft::match(12.34, raft::max(-12.34f, 12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match(12.34, raft::max(-12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Max3) +{ + ASSERT_TRUE(raft::match(1234, raft::max(1234, 0, -1234), raft::Compare())); + ASSERT_TRUE(raft::match(1234, raft::max(-1234, 1234, 0), raft::Compare())); + ASSERT_TRUE(raft::match(1234, raft::max(0, -1234, 1234), raft::Compare())); + + ASSERT_TRUE( + raft::match(12.34, raft::max(12.34f, 0., -12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE( + raft::match(12.34, raft::max(-12.34, 12.34f, 0.), raft::CompareApprox(0.000001))); + ASSERT_TRUE( + raft::match(12.34, raft::max(0., -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Min2) +{ + ASSERT_TRUE(raft::match(-1234, raft::min(-1234, 1234), raft::Compare())); + ASSERT_TRUE(raft::match(123u, raft::min(1234u, 123u), raft::Compare())); + ASSERT_TRUE(raft::match(-1234ll, raft::min(-1234ll, 1234ll), raft::Compare())); + ASSERT_TRUE( + raft::match(123ull, raft::min(1234ull, 123ull), raft::Compare())); + + ASSERT_TRUE(raft::match(-12.34f, raft::min(-12.34f, 12.34f), raft::Compare())); + ASSERT_TRUE(raft::match(-12.34, raft::min(-12.34, 12.34), raft::Compare())); + ASSERT_TRUE( + raft::match(-12.34, raft::min(-12.34f, 12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE( + raft::match(-12.34, raft::min(-12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Min3) +{ + ASSERT_TRUE(raft::match(-1234, raft::min(1234, 0, -1234), raft::Compare())); + ASSERT_TRUE(raft::match(-1234, raft::min(-1234, 1234, 0), raft::Compare())); + ASSERT_TRUE(raft::match(-1234, raft::min(0, -1234, 1234), raft::Compare())); + + ASSERT_TRUE( + raft::match(-12.34, raft::min(12.34f, 0., -12.34), raft::CompareApprox(0.000001))); + ASSERT_TRUE( + raft::match(-12.34, raft::min(-12.34, 12.34f, 0.), raft::CompareApprox(0.000001))); + ASSERT_TRUE( + raft::match(-12.34, raft::min(0., -12.34, 12.34f), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Pow) +{ + ASSERT_TRUE(raft::match( + std::pow(12.34f, 2.f), raft::pow(12.34f, 2.f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::pow(12.34, 2.), raft::pow(12.34, 2.), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Sgn) +{ + ASSERT_TRUE(raft::match(-1, raft::sgn(-1234), raft::Compare())); + ASSERT_TRUE(raft::match(0, raft::sgn(0), raft::Compare())); + ASSERT_TRUE(raft::match(1, raft::sgn(1234), raft::Compare())); + ASSERT_TRUE(raft::match(-1, raft::sgn(-12.34f), raft::Compare())); + ASSERT_TRUE(raft::match(0, raft::sgn(0.f), raft::Compare())); + ASSERT_TRUE(raft::match(1, raft::sgn(12.34f), raft::Compare())); +} + +TEST(MathHost, Sin) +{ + ASSERT_TRUE( + raft::match(std::sin(12.34f), raft::sin(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::sin(12.34), raft::sin(12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, SinCos) +{ + float xf = 12.34f; + float sf, cf; + raft::sincos(xf, &sf, &cf); + ASSERT_TRUE(raft::match(std::sin(12.34f), sf, raft::CompareApprox(0.0001f))); + ASSERT_TRUE(raft::match(std::cos(12.34f), cf, raft::CompareApprox(0.0001f))); + double xd = 12.34f; + double sd, cd; + raft::sincos(xd, &sd, &cd); + ASSERT_TRUE(raft::match(std::sin(12.34), sd, raft::CompareApprox(0.000001))); + ASSERT_TRUE(raft::match(std::cos(12.34), cd, raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Sqrt) +{ + ASSERT_TRUE( + raft::match(std::sqrt(12.34f), raft::sqrt(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::sqrt(12.34), raft::sqrt(12.34), raft::CompareApprox(0.000001))); +} + +TEST(MathHost, Tanh) +{ + ASSERT_TRUE( + raft::match(std::tanh(12.34f), raft::tanh(12.34f), raft::CompareApprox(0.0001f))); + ASSERT_TRUE( + raft::match(std::tanh(12.34), raft::tanh(12.34), raft::CompareApprox(0.000001))); +} diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index cbfd97ebc6..fedbee919d 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -52,7 +52,7 @@ __global__ void naiveDistanceKernel(DataType* dist, } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) - acc = raft::mySqrt(acc); + acc = raft::sqrt(acc); int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } @@ -79,9 +79,9 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { - acc = raft::myMax(acc, diff); + acc = raft::max(acc, diff); } else if (type == raft::distance::DistanceType::Canberra) { - const auto add = raft::myAbs(a) + raft::myAbs(b); + const auto add = raft::abs(a) + raft::abs(b); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); @@ -119,7 +119,7 @@ __global__ void naiveCosineDistanceKernel( int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = (DataType)1.0 - acc_ab / (raft::sqrt(acc_a) * raft::sqrt(acc_b)); } template @@ -137,7 +137,7 @@ __global__ void naiveHellingerDistanceKernel( int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto a = x[xidx]; auto b = y[yidx]; - acc_ab += raft::mySqrt(a) * raft::mySqrt(b); + acc_ab += raft::sqrt(a) * raft::sqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; @@ -145,7 +145,7 @@ __global__ void naiveHellingerDistanceKernel( // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::sqrt(rectifier * acc_ab); } template @@ -167,11 +167,11 @@ __global__ void naiveLpUnexpDistanceKernel(DataType* dist, int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto a = x[xidx]; auto b = y[yidx]; - auto diff = raft::myAbs(a - b); - acc += raft::myPow(diff, p); + auto diff = raft::abs(a - b); + acc += raft::pow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); + acc = raft::pow(acc, one_over_p); int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } @@ -222,7 +222,7 @@ __global__ void naiveJensenShannonDistanceKernel( acc += (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); } - acc = raft::mySqrt(0.5f * acc); + acc = raft::sqrt(0.5f * acc); int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } @@ -297,7 +297,7 @@ __global__ void naiveCorrelationDistanceKernel( auto Q_denom = k * a_sq_norm - (a_norm * a_norm); auto R_denom = k * b_sq_norm - (b_norm * b_norm); - acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + acc = 1 - (numer / raft::sqrt(Q_denom * R_denom)); int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index e746a2382d..54de12307a 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -60,7 +60,7 @@ __global__ void naiveKernel(raft::KeyValuePair* min, auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { acc = raft::mySqrt(acc); } + if (Sqrt) { acc = raft::sqrt(acc); } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/test/linalg/matrix_vector.cu index 7018e1da96..fb1e2235f9 100644 --- a/cpp/test/linalg/matrix_vector.cu +++ b/cpp/test/linalg/matrix_vector.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,7 +116,7 @@ void naive_matrix_vector_op_launch(const raft::handle_t& handle, } }; auto operation_bin_div_skip_zero = [] __device__(T mat_element, T vec_element) { - if (raft::myAbs(vec_element) < T(1e-10)) + if (raft::abs(vec_element) < T(1e-10)) return T(0); else return mat_element / vec_element; diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 94540b9ff6..90cfbd8f89 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,10 +56,10 @@ __global__ void naiveRowNormKernel( if (type == L2Norm) { acc += data[rowStart * D + i] * data[rowStart * D + i]; } else { - acc += raft::myAbs(data[rowStart * D + i]); + acc += raft::abs(data[rowStart * D + i]); } } - dots[rowStart] = do_sqrt ? raft::mySqrt(acc) : acc; + dots[rowStart] = do_sqrt ? raft::sqrt(acc) : acc; } } @@ -131,10 +131,10 @@ __global__ void naiveColNormKernel( Type acc = 0; for (IdxT i = 0; i < N; i++) { Type v = data[colID + i * D]; - acc += type == L2Norm ? v * v : raft::myAbs(v); + acc += type == L2Norm ? v * v : raft::abs(v); } - dots[colID] = do_sqrt ? raft::mySqrt(acc) : acc; + dots[colID] = do_sqrt ? raft::sqrt(acc) : acc; } template diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu index 54c2e2a7aa..5cb63a5697 100644 --- a/cpp/test/linalg/power.cu +++ b/cpp/test/linalg/power.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ template __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); } + if (idx < len) { out[idx] = raft::pow(in1[idx], in2[idx]); } } template @@ -43,7 +43,7 @@ template __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); } + if (idx < len) { out[idx] = raft::pow(in1[idx], in2); } } template diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu index 9008313b58..93150ca77d 100644 --- a/cpp/test/linalg/sqrt.cu +++ b/cpp/test/linalg/sqrt.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ template __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); } + if (idx < len) { out[idx] = raft::sqrt(in1[idx]); } } template diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index f2c1a6249c..9dcbfc8899 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ template __global__ void naiveSqrtKernel(Type* in, Type* out, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = std::sqrt(in[idx]); } + if (idx < len) { out[idx] = raft::sqrt(in[idx]); } } template diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index b88b6abd9e..bb2f334db4 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -131,7 +131,7 @@ __global__ void naive_distance_kernel(EvalT* dist, } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) - acc = raft::mySqrt(acc); + acc = raft::sqrt(acc); dist[midx * n + nidx] = acc; } } diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index bdce79b76e..0bf494b624 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,8 @@ class RngTest : public ::testing::TestWithParam> { case RNG_LogNormal: { auto var = params.end * params.end; auto mu = params.start; - meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[0] = raft::exp(mu + var * T(0.5)); + meanvar[1] = (raft::exp(var) - T(1.0)) * raft::exp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -169,7 +169,7 @@ class RngTest : public ::testing::TestWithParam> { meanvar[1] = meanvar[0] * meanvar[0]; break; case RNG_Rayleigh: - meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); + meanvar[0] = params.start * raft::sqrt(T(3.1415 / 2.0)); meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: @@ -239,8 +239,8 @@ class RngMdspanTest : public ::testing::TestWithParam> { case RNG_LogNormal: { auto var = params.end * params.end; auto mu = params.start; - meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[0] = raft::exp(mu + var * T(0.5)); + meanvar[1] = (raft::exp(var) - T(1.0)) * raft::exp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -263,7 +263,7 @@ class RngMdspanTest : public ::testing::TestWithParam> { meanvar[1] = meanvar[0] * meanvar[0]; break; case RNG_Rayleigh: - meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); + meanvar[0] = params.start * raft::sqrt(T(3.1415 / 2.0)); meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: From 0e96662f9b4fc77cd4ac6e528fe6103c81715287 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Sat, 21 Jan 2023 21:55:43 +0100 Subject: [PATCH 27/44] Improvements in `matrix::gather`: test coverage, compilation errors, performance (#1126) In order to deprecate `copy_selected` from `ann_utils.cuh`, I wanted to make sure that the performance of `matrix::gather` was on par. But in the process I discovered that: - Map transforms and conditional copy were not tested at all. - In fact, most of the API in `gather.cuh` wasn't covered in tests and some of the functions didn't even compile. - The same type `MatrixIteratorT` was used for the input and output iterators, which made it impossible to take advantage of custom iterators, as is needed in `kmeans_balanced` to convert the dataset from `T` to `float` and gather in a single step. - The performance was really poor when `D` is small because the kernel assigns one block per row (so a block could be working on only 2 or 3 elements...) This PR addresses all the aforementioned issues. Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1126 --- cpp/bench/CMakeLists.txt | 2 +- cpp/bench/matrix/argmin.cu | 17 +- cpp/bench/matrix/gather.cu | 101 +++++ .../raft/cluster/detail/kmeans_common.cuh | 2 +- cpp/include/raft/core/operators.hpp | 51 ++- cpp/include/raft/matrix/detail/gather.cuh | 236 ++++++----- cpp/include/raft/matrix/gather.cuh | 371 ++++++++---------- cpp/test/matrix/gather.cu | 208 ++++++---- 8 files changed, 578 insertions(+), 410 deletions(-) create mode 100644 cpp/bench/matrix/gather.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 813483adc5..8dcdb325e9 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -103,7 +103,7 @@ if(BUILD_BENCH) bench/main.cpp ) - ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/main.cpp) + ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/main.cpp) ConfigureBench( NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/matrix/argmin.cu index 0d0dea0fdb..52f5aab7f3 100644 --- a/cpp/bench/matrix/argmin.cu +++ b/cpp/bench/matrix/argmin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,11 @@ #include #include #include +#include #include -namespace raft::bench::linalg { +namespace raft::bench::matrix { template struct ArgminParams { @@ -57,15 +58,11 @@ struct Argmin : public fixture { raft::device_vector indices; }; // struct Argmin -const std::vector> argmin_inputs_i64{ - {1000, 64}, {1000, 128}, {1000, 256}, {1000, 512}, {1000, 1024}, - {10000, 64}, {10000, 128}, {10000, 256}, {10000, 512}, {10000, 1024}, - {100000, 64}, {100000, 128}, {100000, 256}, {100000, 512}, {100000, 1024}, - {1000000, 64}, {1000000, 128}, {1000000, 256}, {1000000, 512}, {1000000, 1024}, - {10000000, 64}, {10000000, 128}, {10000000, 256}, {10000000, 512}, {10000000, 1024}, -}; +const std::vector> argmin_inputs_i64 = + raft::util::itertools::product>({1000, 10000, 100000, 1000000, 10000000}, + {64, 128, 256, 512, 1024}); RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); -} // namespace raft::bench::linalg +} // namespace raft::bench::matrix diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/matrix/gather.cu new file mode 100644 index 0000000000..97812c20a1 --- /dev/null +++ b/cpp/bench/matrix/gather.cu @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::matrix { + +template +struct GatherParams { + IdxT rows, cols, map_length; +}; + +template +inline auto operator<<(std::ostream& os, const GatherParams& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.map_length; + return os; +} + +template +struct Gather : public fixture { + Gather(const GatherParams& p) : params(p) {} + + void allocate_data(const ::benchmark::State& state) override + { + matrix = raft::make_device_matrix(handle, params.rows, params.cols); + map = raft::make_device_vector(handle, params.map_length); + out = raft::make_device_matrix(handle, params.map_length, params.cols); + stencil = raft::make_device_vector(handle, Conditional ? params.map_length : IdxT(0)); + + raft::random::RngState rng{1234}; + raft::random::uniform( + rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + raft::random::uniformInt( + handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); + if constexpr (Conditional) { + raft::random::uniform(rng, stencil.data_handle(), params.map_length, T(-1), T(1), stream); + } + handle.sync_stream(stream); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto matrix_const_view = raft::make_device_matrix_view( + matrix.data_handle(), matrix.extent(0), matrix.extent(1)); + auto map_const_view = + raft::make_device_vector_view(map.data_handle(), map.extent(0)); + if constexpr (Conditional) { + auto stencil_const_view = + raft::make_device_vector_view(stencil.data_handle(), stencil.extent(0)); + auto pred_op = raft::plug_const_op(T(0.0), raft::greater_op()); + raft::matrix::gather_if( + handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op); + } else { + raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view()); + } + }); + } + + private: + GatherParams params; + raft::device_matrix matrix, out; + raft::device_vector stencil; + raft::device_vector map; +}; // struct Gather + +template +using GatherIf = Gather; + +const std::vector> gather_inputs_i64 = + raft::util::itertools::product>( + {1000000}, {10, 20, 50, 100, 200, 500}, {1000, 10000, 100000, 1000000}); + +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +} // namespace raft::bench::matrix diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh index 2fd33ac759..559793442f 100644 --- a/cpp/include/raft/cluster/detail/kmeans_common.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh @@ -335,7 +335,7 @@ void shuffleAndGather(const raft::handle_t& handle, in.extent(1), in.extent(0), indices.data_handle(), - n_samples_to_gather, + static_cast(n_samples_to_gather), out.data_handle(), stream); } diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp index de27c2b271..edb437c880 100644 --- a/cpp/include/raft/core/operators.hpp +++ b/cpp/include/raft/core/operators.hpp @@ -147,6 +147,14 @@ struct div_checkzero_op { } }; +struct modulo_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const + { + return a % b; + } +}; + struct pow_op { template RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const @@ -189,17 +197,49 @@ struct argmax_op { } }; +struct greater_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const + { + return a > b; + } +}; + +struct less_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const + { + return a < b; + } +}; + +struct greater_or_equal_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const + { + return a >= b; + } +}; + +struct less_or_equal_op { + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const + { + return a <= b; + } +}; + struct equal_op { - template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const { return a == b; } }; struct notequal_op { - template - constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const { return a != b; } @@ -267,6 +307,9 @@ using div_const_op = plug_const_op; template using div_checkzero_const_op = plug_const_op; +template +using modulo_const_op = plug_const_op; + template using pow_const_op = plug_const_op; diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index c006f69e47..a8efc2d0d0 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,41 +17,63 @@ #pragma once #include +#include namespace raft { namespace matrix { namespace detail { -// gatherKernel conditionally copies rows from the source matrix 'in' into the destination matrix -// 'out' according to a map (or a transformed map) -template +struct gather_policy { + static constexpr int n_threads = tpb; + static constexpr int work_per_thread = wpt; + static constexpr int stride = tpb * wpt; +}; + +/** Conditionally copies rows from the source matrix 'in' into the destination matrix + * 'out' according to a map (or a transformed map) */ +template -__global__ void gatherKernel(const MatrixIteratorT in, - IndexT D, - IndexT N, - MapIteratorT map, - StencilIteratorT stencil, - MatrixIteratorT out, - PredicateOp pred_op, - MapTransformOp transform_op) + typename OutputIteratorT, + typename IndexT> +__global__ void gather_kernel(const InputIteratorT in, + IndexT D, + IndexT len, + const MapIteratorT map, + StencilIteratorT stencil, + OutputIteratorT out, + PredicateOp pred_op, + MapTransformOp transform_op) { typedef typename std::iterator_traits::value_type MapValueT; typedef typename std::iterator_traits::value_type StencilValueT; - IndexT outRowStart = blockIdx.x * D; - MapValueT map_val = map[blockIdx.x]; - StencilValueT stencil_val = stencil[blockIdx.x]; +#pragma unroll + for (IndexT wid = 0; wid < Policy::work_per_thread; wid++) { + IndexT tid = threadIdx.x + (Policy::work_per_thread * static_cast(blockIdx.x) + wid) * + Policy::n_threads; + if (tid < len) { + IndexT i_dst = tid / D; + IndexT j = tid % D; + + MapValueT map_val = map[i_dst]; + StencilValueT stencil_val = stencil[i_dst]; - bool predicate = pred_op(stencil_val); - if (predicate) { - IndexT inRowStart = transform_op(map_val) * D; - for (int i = threadIdx.x; i < D; i += TPB) { - out[outRowStart + i] = in[inRowStart + i]; + bool predicate = pred_op(stencil_val); + if (predicate) { + IndexT i_src = transform_op(map_val); + out[tid] = in[i_src * D + j]; + } } } } @@ -60,7 +82,7 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @brief gather conditionally copies rows from a source matrix into a destination matrix according * to a transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -69,7 +91,10 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -83,18 +108,20 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gatherImpl(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename MapTransformOp, + typename OutputIteratorT, + typename IndexT> +void gatherImpl(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, MapTransformOp transform_op, cudaStream_t stream) @@ -102,9 +129,6 @@ void gatherImpl(const MatrixIteratorT in, // skip in case of 0 length input if (map_length <= 0 || N <= 0 || D <= 0) return; - // signed integer type for indexing or global offsets - typedef int IndexT; - // map value type typedef typename std::iterator_traits::value_type MapValueT; @@ -121,38 +145,26 @@ void gatherImpl(const MatrixIteratorT in, static_assert((std::is_convertible::value), "UnaryPredicateOp's result type must be convertible to bool type"); - if (D <= 32) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); - } else if (D <= 64) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); - } else if (D <= 128) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); + IndexT len = map_length * D; + constexpr int TPB = 128; + const int n_sm = raft::getMultiProcessorCount(); + // The following empirical heuristics enforce that we keep a good balance between having enough + // blocks and enough work per thread. + if (len < 32 * TPB * n_sm) { + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); + } else if (len < 32 * 4 * TPB * n_sm) { + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); } else { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -160,10 +172,13 @@ void gatherImpl(const MatrixIteratorT in, /** * @brief gather copies rows from a source matrix into a destination matrix according to a map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -174,13 +189,13 @@ void gatherImpl(const MatrixIteratorT in, * @param out Pointer to the output matrix (assumed to be row-major) * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, cudaStream_t stream) { typedef typename std::iterator_traits::value_type MapValueT; @@ -192,12 +207,15 @@ void gather(const MatrixIteratorT in, * @brief gather copies rows from a source matrix into a destination matrix according to a * transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -209,13 +227,17 @@ void gather(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, MapTransformOp transform_op, cudaStream_t stream) { @@ -227,7 +249,7 @@ void gather(const MatrixIteratorT in, * @brief gather_if conditionally copies rows from a source matrix into a destination matrix * according to a map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -235,6 +257,9 @@ void gather(const MatrixIteratorT in, * simple pointer type). * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -247,17 +272,19 @@ void gather(const MatrixIteratorT in, * @param pred_op Predicate to apply to the stencil values * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename UnaryPredicateOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, cudaStream_t stream) { @@ -269,7 +296,7 @@ void gather_if(const MatrixIteratorT in, * @brief gather_if conditionally copies rows from a source matrix into a destination matrix * according to a transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -278,7 +305,10 @@ void gather_if(const MatrixIteratorT in, * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT type. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -292,18 +322,20 @@ void gather_if(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename MapTransformOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, MapTransformOp transform_op, cudaStream_t stream) diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh index 6a923fb0cc..9487da35b5 100644 --- a/cpp/include/raft/matrix/gather.cuh +++ b/cpp/include/raft/matrix/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft::matrix { @@ -28,62 +29,68 @@ namespace raft::matrix { */ /** - * @brief gather copies rows from a source matrix into a destination matrix according to a map. + * @brief Copies rows from a source matrix into a destination matrix according to a map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a - * simple pointer type). - * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple - * pointer type). + * For each output row, read the index in the input matrix from the map and copy the row. * - * @param in Pointer to the input matrix (assumed to be row-major) - * @param D Leading dimension of the input matrix 'in', which in-case of row-major - * storage is the number of columns - * @param N Second dimension - * @param map Pointer to the input sequence of gather locations - * @param map_length The length of 'map' and 'stencil' - * @param out Pointer to the output matrix (assumed to be row-major) + * @tparam InputIteratorT Input iterator type, for the input matrix (may be a pointer type). + * @tparam MapIteratorT Input iterator type, for the map (may be a pointer type). + * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type). + * @tparam IndexT Index type. + * + * @param in Input matrix, dim = [N, D] (row-major) + * @param D Number of columns of the input/output matrices + * @param N Number of rows of the input matrix + * @param map Map of row indices to gather, dim = [map_length] + * @param map_length The length of 'map', number of rows of the output matrix + * @param out Output matrix, dim = [map_length, D] (row-major) * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, cudaStream_t stream) { detail::gather(in, D, N, map, map_length, out, stream); } /** - * @brief gather copies rows from a source matrix into a destination matrix according to a - * transformed map. + * @brief Copies rows from a source matrix into a destination matrix according to a transformed map. + * + * For each output row, read the index in the input matrix from the map, apply a transformation to + * this input index and copy the row. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a - * simple pointer type). - * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple - * pointer type). - * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * @tparam InputIteratorT Input iterator type, for the input matrix (may be a pointer type). + * @tparam MapIteratorT Input iterator type, for the map (may be a pointer type). + * @tparam MapTransformOp Unary lambda expression or operator type. MapTransformOp's result type + * must be convertible to IndexT. + * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type). + * @tparam IndexT Index type. * - * @param in Pointer to the input matrix (assumed to be row-major) - * @param D Leading dimension of the input matrix 'in', which in-case of row-major - * storage is the number of columns - * @param N Second dimension - * @param map Pointer to the input sequence of gather locations - * @param map_length The length of 'map' and 'stencil' - * @param out Pointer to the output matrix (assumed to be row-major) - * @param transform_op The transformation operation, transforms the map values to IndexT + * @param in Input matrix, dim = [N, D] (row-major) + * @param D Number of columns of the input/output matrices + * @param N Number of rows of the input matrix + * @param map Map of row indices to gather, dim = [map_length] + * @param map_length The length of 'map', number of rows of the output matrix + * @param out Output matrix, dim = [map_length, D] (row-major) + * @param transform_op Transformation to apply to map values * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, MapTransformOp transform_op, cudaStream_t stream) { @@ -91,40 +98,42 @@ void gather(const MatrixIteratorT in, } /** - * @brief gather_if conditionally copies rows from a source matrix into a destination matrix - * according to a map. + * @brief Conditionally copies rows from a source matrix into a destination matrix. + * + * For each output row, read the index in the input matrix from the map, read a stencil value, apply + * a predicate to the stencil value, and if true, copy the row. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a - * simple pointer type). - * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple - * pointer type). - * @tparam StencilIteratorT Random-access iterator type, for reading input stencil (may be a - * simple pointer type). - * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result - * type must be convertible to bool type. + * @tparam InputIteratorT Input iterator type, for the input matrix (may be a pointer type). + * @tparam MapIteratorT Input iterator type, for the map (may be a pointer type). + * @tparam StencilIteratorT Input iterator type, for the stencil (may be a pointer type). + * @tparam UnaryPredicateOp Unary lambda expression or operator type. UnaryPredicateOp's result type + * must be convertible to bool type. + * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type). + * @tparam IndexT Index type. * - * @param in Pointer to the input matrix (assumed to be row-major) - * @param D Leading dimension of the input matrix 'in', which in-case of row-major - * storage is the number of columns - * @param N Second dimension - * @param map Pointer to the input sequence of gather locations - * @param stencil Pointer to the input sequence of stencil or predicate values - * @param map_length The length of 'map' and 'stencil' - * @param out Pointer to the output matrix (assumed to be row-major) + * @param in Input matrix, dim = [N, D] (row-major) + * @param D Number of columns of the input/output matrices + * @param N Number of rows of the input matrix + * @param map Map of row indices to gather, dim = [map_length] + * @param stencil Sequence of stencil values, dim = [map_length] + * @param map_length The length of 'map' and 'stencil', number of rows of the output matrix + * @param out Output matrix, dim = [map_length, D] (row-major) * @param pred_op Predicate to apply to the stencil values * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename UnaryPredicateOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, cudaStream_t stream) { @@ -132,44 +141,47 @@ void gather_if(const MatrixIteratorT in, } /** - * @brief gather_if conditionally copies rows from a source matrix into a destination matrix - * according to a transformed map. + * @brief Conditionally copies rows according to a transformed map. + * + * For each output row, read the index in the input matrix from the map, read a stencil value, + * apply a predicate to the stencil value, and if true, apply a transformation to the input index + * and copy the row. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a - * simple pointer type). - * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple - * pointer type). - * @tparam StencilIteratorT Random-access iterator type, for reading input stencil (may be a - * simple pointer type). - * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result - * type must be convertible to bool type. - * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * @tparam InputIteratorT Input iterator type, for the input matrix (may be a pointer type). + * @tparam MapIteratorT Input iterator type, for the map (may be a pointer type). + * @tparam MapTransformOp Unary lambda expression or operator type. MapTransformOp's result type + * must be convertible to IndexT. + * @tparam StencilIteratorT Input iterator type, for the stencil (may be a pointer type). + * @tparam UnaryPredicateOp Unary lambda expression or operator type. UnaryPredicateOp's result type + * must be convertible to bool type. + * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type). + * @tparam IndexT Index type. * - * @param in Pointer to the input matrix (assumed to be row-major) - * @param D Leading dimension of the input matrix 'in', which in-case of row-major - * storage is the number of columns - * @param N Second dimension - * @param map Pointer to the input sequence of gather locations - * @param stencil Pointer to the input sequence of stencil or predicate values - * @param map_length The length of 'map' and 'stencil' - * @param out Pointer to the output matrix (assumed to be row-major) + * @param in Input matrix, dim = [N, D] (row-major) + * @param D Number of columns of the input/output matrices + * @param N Number of rows of the input matrix + * @param map Map of row indices to gather, dim = [map_length] + * @param stencil Sequence of stencil values, dim = [map_length] + * @param map_length The length of 'map' and 'stencil', number of rows of the output matrix + * @param out Output matrix, dim = [map_length, D] (row-major) * @param pred_op Predicate to apply to the stencil values - * @param transform_op The transformation operation, transforms the map values to IndexT + * @param transform_op Transformation to apply to map values * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename MapTransformOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, MapTransformOp transform_op, cudaStream_t stream) @@ -178,58 +190,31 @@ void gather_if(const MatrixIteratorT in, } /** - * @brief gather copies rows from a source matrix into a destination matrix according to a map. + * @brief Copies rows from a source matrix into a destination matrix according to a transformed map. * - * @tparam matrix_t Matrix element type - * @tparam map_t Map vector type - * @tparam idx_t integer type used for indexing - * @param[in] handle raft handle for managing resources - * @param[in] in Input matrix (assumed to be row-major) - * @param[in] map Vector of gather locations - * @param[out] out Output matrix (assumed to be row-major) - */ -template -void gather(const raft::handle_t& handle, - raft::device_matrix_view in, - raft::device_vector_view map, - raft::device_matrix_view out) -{ - RAFT_EXPECTS(out.extent(0) == map.extent(0), - "Number of rows in output matrix must equal the size of the map vector"); - RAFT_EXPECTS(out.extent(1) == in.extent(1), - "Number of columns in input and output matrices must be equal."); - - raft::matrix::detail::gather( - const_cast(in.data_handle()), // TODO: There's a better way to handle this - static_cast(in.extent(1)), - static_cast(in.extent(0)), - map.data_handle(), - static_cast(map.extent(0)), - out.data_handle(), - handle.get_stream()); -} - -/** - * @brief gather copies rows from a source matrix into a destination matrix according to a - * transformed map. + * For each output row, read the index in the input matrix from the map, apply a transformation to + * this input index if specified, and copy the row. * - * @tparam matrix_t Matrix type - * @tparam map_t Map vector type - * @tparam map_xform_t Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to idx_t (= int) type. - * @tparam idx_t integer type for indexing - * @param[in] handle raft handle for managing resources - * @param[in] in Input matrix (assumed to be row-major) - * @param[in] map Input vector of gather locations - * @param[out] out Output matrix (assumed to be row-major) - * @param[in] transform_op The transformation operation, transforms the map values to idx_t + * @tparam matrix_t Matrix element type + * @tparam map_t Integer type of map elements + * @tparam idx_t Integer type used for indexing + * @tparam map_xform_t Unary lambda expression or operator type. MapTransformOp's result type must + * be convertible to idx_t. + * @param[in] handle raft handle for managing resources + * @param[in] in Input matrix, dim = [N, D] (row-major) + * @param[in] map Map of row indices to gather, dim = [map_length] + * @param[out] out Output matrix, dim = [map_length, D] (row-major) + * @param[in] transform_op (optional) Transformation to apply to map values */ -template +template void gather(const raft::handle_t& handle, raft::device_matrix_view in, raft::device_vector_view map, - raft::device_matrix_view out, - map_xform_t transform_op) + raft::device_matrix_view out, + map_xform_t transform_op = raft::identity_op()) { RAFT_EXPECTS(out.extent(0) == map.extent(0), "Number of rows in output matrix must equal the size of the map vector"); @@ -238,95 +223,51 @@ void gather(const raft::handle_t& handle, detail::gather( const_cast(in.data_handle()), // TODO: There's a better way to handle this - static_cast(in.extent(1)), - static_cast(in.extent(0)), - map, - static_cast(map.extent(0)), + in.extent(1), + in.extent(0), + map.data_handle(), + map.extent(0), out.data_handle(), transform_op, handle.get_stream()); } /** - * @brief gather_if conditionally copies rows from a source matrix into a destination matrix - * according to a map. + * @brief Conditionally copies rows according to a transformed map. + * + * For each output row, read the index in the input matrix from the map, read a stencil value, + * apply a predicate to the stencil value, and if true, apply a transformation if specified to the + * input index, and copy the row. * - * @tparam matrix_t Matrix value type - * @tparam map_t Map vector type - * @tparam stencil_t Stencil vector type - * @tparam unary_pred_t Unary lambda expression or operator type, unary_pred_t's result - * type must be convertible to bool type. - * @tparam idx_t integer type for indexing - * @param[in] handle raft handle for managing resources - * @param[in] in Input matrix (assumed to be row-major) - * @param[in] map Input vector of gather locations - * @param[in] stencil Input vector of stencil or predicate values - * @param[out] out Output matrix (assumed to be row-major) - * @param[in] pred_op Predicate to apply to the stencil values + * @tparam matrix_t Matrix element type + * @tparam map_t Integer type of map elements + * @tparam stencil_t Value type for stencil (input type for the pred_op) + * @tparam unary_pred_t Unary lambda expression or operator type. unary_pred_t's result + * type must be convertible to bool type. + * @tparam map_xform_t Unary lambda expression or operator type. MapTransformOp's result type must + * be convertible to idx_t. + * @tparam idx_t Integer type used for indexing + * @param[in] handle raft handle for managing resources + * @param[in] in Input matrix, dim = [N, D] (row-major) + * @param[in] map Map of row indices to gather, dim = [map_length] + * @param[in] stencil Vector of stencil values, dim = [map_length] + * @param[out] out Output matrix, dim = [map_length, D] (row-major) + * @param[in] pred_op Predicate to apply to the stencil values + * @param[in] transform_op (optional) Transformation to apply to map values */ template + typename idx_t, + typename map_xform_t = raft::identity_op> void gather_if(const raft::handle_t& handle, raft::device_matrix_view in, raft::device_matrix_view out, raft::device_vector_view map, raft::device_vector_view stencil, - unary_pred_t pred_op) -{ - RAFT_EXPECTS(out.extent(0) == map.extent(0), - "Number of rows in output matrix must equal the size of the map vector"); - RAFT_EXPECTS(out.extent(1) == in.extent(1), - "Number of columns in input and output matrices must be equal."); - RAFT_EXPECTS(map.extent(0) == stencil.extent(0), - "Number of elements in stencil must equal number of elements in map"); - - detail::gather_if(const_cast(in.data_handle()), - out.extent(1), - out.extent(0), - map.data_handle(), - stencil.data_handle(), - map.extent(0), - out.data_handle(), - pred_op, - handle.get_stream()); -} - -/** - * @brief gather_if conditionally copies rows from a source matrix into a destination matrix - * according to a transformed map. - * - * @tparam matrix_t Matrix value type, for reading input matrix - * @tparam map_t Vector value type for map - * @tparam stencil_t Vector value type for stencil - * @tparam unary_pred_t Unary lambda expression or operator type, unary_pred_t's result - * type must be convertible to bool type. - * @tparam map_xform_t Unary lambda expression or operator type, map_xform_t's result - * type must be convertible to idx_t (= int) type. - * @tparam idx_t integer type for indexing - * @param[in] handle raft handle for managing resources - * @param[in] in Input matrix (assumed to be row-major) - * @param[in] map Vector of gather locations - * @param[in] stencil Vector of stencil or predicate values - * @param[out] out Output matrix (assumed to be row-major) - * @param[in] pred_op Predicate to apply to the stencil values - * @param[in] transform_op The transformation operation, transforms the map values to idx_t - */ -template -void gather_if(const raft::handle_t& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - raft::device_vector_view map, - raft::device_vector_view stencil, unary_pred_t pred_op, - map_xform_t transform_op) + map_xform_t transform_op = raft::identity_op()) { RAFT_EXPECTS(out.extent(0) == map.extent(0), "Number of rows in output matrix must equal the size of the map vector"); diff --git a/cpp/test/matrix/gather.cu b/cpp/test/matrix/gather.cu index 0bea62e9cf..3659265e84 100644 --- a/cpp/test/matrix/gather.cu +++ b/cpp/test/matrix/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,50 +18,72 @@ #include #include #include +#include #include #include #include +#include #include namespace raft { -template -void naiveGatherImpl( - MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out) +template +void naiveGather(InputIteratorT in, + IdxT D, + IdxT N, + MapIteratorT map, + StencilIteratorT stencil, + IdxT map_length, + OutputIteratorT out, + UnaryPredicateOp pred_op, + MapTransformOp transform_op) { - for (int outRow = 0; outRow < map_length; ++outRow) { + for (IdxT outRow = 0; outRow < map_length; ++outRow) { + if constexpr (Conditional) { + auto stencil_val = stencil[outRow]; + if (!pred_op(stencil_val)) continue; + } typename std::iterator_traits::value_type map_val = map[outRow]; - int inRowStart = map_val * D; - int outRowStart = outRow * D; - for (int i = 0; i < D; ++i) { + IdxT transformed_val; + if constexpr (MapTransform) { + transformed_val = transform_op(map_val); + } else { + transformed_val = map_val; + } + IdxT inRowStart = transformed_val * D; + IdxT outRowStart = outRow * D; + for (IdxT i = 0; i < D; ++i) { out[outRowStart + i] = in[inRowStart + i]; } } } -template -void naiveGather( - MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out) -{ - naiveGatherImpl(in, D, N, map, map_length, out); -} - +template struct GatherInputs { - uint32_t nrows; - uint32_t ncols; - uint32_t map_length; + IdxT nrows; + IdxT ncols; + IdxT map_length; unsigned long long int seed; }; -template -class GatherTest : public ::testing::TestWithParam { +template +class GatherTest : public ::testing::TestWithParam> { protected: GatherTest() : stream(handle.get_stream()), - params(::testing::TestWithParam::GetParam()), + params(::testing::TestWithParam>::GetParam()), d_in(0, stream), d_out_exp(0, stream), d_out_act(0, stream), + d_stencil(0, stream), d_map(0, stream) { } @@ -71,44 +93,71 @@ class GatherTest : public ::testing::TestWithParam { raft::random::RngState r(params.seed); raft::random::RngState r_int(params.seed); - uint32_t nrows = params.nrows; - uint32_t ncols = params.ncols; - uint32_t map_length = params.map_length; - uint32_t len = nrows * ncols; + IdxT map_length = params.map_length; + IdxT len = params.nrows * params.ncols; // input matrix setup - d_in.resize(nrows * ncols, stream); - h_in.resize(nrows * ncols); + d_in.resize(params.nrows * params.ncols, stream); + h_in.resize(params.nrows * params.ncols); raft::random::uniform(handle, r, d_in.data(), len, MatrixT(-1.0), MatrixT(1.0)); raft::update_host(h_in.data(), d_in.data(), len, stream); // map setup d_map.resize(map_length, stream); h_map.resize(map_length); - raft::random::uniformInt(handle, r_int, d_map.data(), map_length, (MapT)0, nrows); + raft::random::uniformInt(handle, r_int, d_map.data(), map_length, (MapT)0, (MapT)params.nrows); raft::update_host(h_map.data(), d_map.data(), map_length, stream); - // expected and actual output matrix setup - h_out.resize(map_length * ncols); - d_out_exp.resize(map_length * ncols, stream); - d_out_act.resize(map_length * ncols, stream); + // stencil setup + if (Conditional) { + d_stencil.resize(map_length, stream); + h_stencil.resize(map_length); + raft::random::uniform(handle, r, d_stencil.data(), map_length, MatrixT(-1.0), MatrixT(1.0)); + raft::update_host(h_stencil.data(), d_stencil.data(), map_length, stream); + } - // launch gather on the host and copy the results to device - naiveGather(h_in.data(), ncols, nrows, h_map.data(), map_length, h_out.data()); - raft::update_device(d_out_exp.data(), h_out.data(), map_length * ncols, stream); + // unary predicate op (used only when Conditional is true) + auto pred_op = raft::plug_const_op(MatrixT(0.0), raft::greater_op()); - auto in_view = raft::make_device_matrix_view( - d_in.data(), nrows, ncols); - auto out_view = - raft::make_device_matrix_view(d_out_act.data(), map_length, ncols); - auto map_view = - raft::make_device_vector_view(d_map.data(), map_length); + // map transform op (used only when MapTransform is true) + auto transform_op = + raft::compose_op(raft::modulo_const_op(params.nrows), raft::add_const_op(10)); - raft::matrix::gather(handle, in_view, map_view, out_view); + // expected and actual output matrix setup + h_out.resize(map_length * params.ncols); + d_out_exp.resize(map_length * params.ncols, stream); + d_out_act.resize(map_length * params.ncols, stream); - // // launch device version of the kernel - // gatherLaunch( - // handle, d_in.data(), ncols, nrows, d_map.data(), map_length, d_out_act.data(), stream); + // launch gather on the host and copy the results to device + naiveGather(h_in.data(), + params.ncols, + params.nrows, + h_map.data(), + h_stencil.data(), + map_length, + h_out.data(), + pred_op, + transform_op); + raft::update_device(d_out_exp.data(), h_out.data(), map_length * params.ncols, stream); + + auto in_view = raft::make_device_matrix_view( + d_in.data(), params.nrows, params.ncols); + auto out_view = raft::make_device_matrix_view( + d_out_act.data(), map_length, params.ncols); + auto map_view = raft::make_device_vector_view(d_map.data(), map_length); + auto stencil_view = + raft::make_device_vector_view(d_stencil.data(), map_length); + + if (Conditional && MapTransform) { + raft::matrix::gather_if( + handle, in_view, out_view, map_view, stencil_view, pred_op, transform_op); + } else if (Conditional) { + raft::matrix::gather_if(handle, in_view, out_view, map_view, stencil_view, pred_op); + } else if (MapTransform) { + raft::matrix::gather(handle, in_view, map_view, out_view, transform_op); + } else { + raft::matrix::gather(handle, in_view, map_view, out_view); + } handle.sync_stream(stream); } @@ -116,41 +165,46 @@ class GatherTest : public ::testing::TestWithParam { protected: raft::handle_t handle; cudaStream_t stream = 0; - GatherInputs params; - std::vector h_in, h_out; + GatherInputs params; + std::vector h_in, h_out, h_stencil; std::vector h_map; - rmm::device_uvector d_in, d_out_exp, d_out_act; + rmm::device_uvector d_in, d_out_exp, d_out_act, d_stencil; rmm::device_uvector d_map; }; -const std::vector inputs = {{1024, 32, 128, 1234ULL}, - {1024, 32, 256, 1234ULL}, - {1024, 32, 512, 1234ULL}, - {1024, 32, 1024, 1234ULL}, - {1024, 64, 128, 1234ULL}, - {1024, 64, 256, 1234ULL}, - {1024, 64, 512, 1234ULL}, - {1024, 64, 1024, 1234ULL}, - {1024, 128, 128, 1234ULL}, - {1024, 128, 256, 1234ULL}, - {1024, 128, 512, 1234ULL}, - {1024, 128, 1024, 1234ULL}}; - -typedef GatherTest GatherTestF; -TEST_P(GatherTestF, Result) -{ - ASSERT_TRUE(devArrMatch( - d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare())); -} - -typedef GatherTest GatherTestD; -TEST_P(GatherTestD, Result) -{ - ASSERT_TRUE(devArrMatch( - d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare())); -} - -INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestF, ::testing::ValuesIn(inputs)); -INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestD, ::testing::ValuesIn(inputs)); +#define GATHER_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE(devArrMatch(d_out_exp.data(), \ + d_out_act.data(), \ + params.map_length* params.ncols, \ + raft::Compare())); \ + } \ + INSTANTIATE_TEST_CASE_P(GatherTests, test_name, ::testing::ValuesIn(test_inputs)) + +const std::vector> inputs_i32 = + raft::util::itertools::product>({25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL}); +const std::vector> inputs_i64 = + raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL}); + +GATHER_TEST((GatherTest), GatherTestFU32I32, inputs_i32); +GATHER_TEST((GatherTest), + GatherTransformTestFU32I32, + inputs_i32); +GATHER_TEST((GatherTest), GatherIfTestFU32I32, inputs_i32); +GATHER_TEST((GatherTest), + GatherIfTransformTestFU32I32, + inputs_i32); +GATHER_TEST((GatherTest), + GatherIfTransformTestDU32I32, + inputs_i32); +GATHER_TEST((GatherTest), + GatherIfTransformTestFU32I64, + inputs_i64); +GATHER_TEST((GatherTest), + GatherIfTransformTestFI64I64, + inputs_i64); } // end namespace raft \ No newline at end of file From 9963c12187d5b73710fa98d0fe0f94f84ce5e669 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Mon, 23 Jan 2023 10:51:58 -0500 Subject: [PATCH 28/44] DOC --- ci/cpu/build.sh | 2 +- ci/gpu/build.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- cpp/CMakeLists.txt | 4 ++-- dependencies.yaml | 6 +++--- docs/source/conf.py | 4 ++-- fetch_rapids.cmake | 2 +- python/pylibraft/CMakeLists.txt | 2 +- python/raft-dask/CMakeLists.txt | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 2f0e2b94ca..657126fdf0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,7 +43,7 @@ export CMAKE_GENERATOR="Ninja" export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" # ucx-py version -export UCX_PY_VERSION='0.30.*' +export UCX_PY_VERSION='0.31.*' ################################################################################ # SETUP - Check environment diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1808480d37..84026203fa 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -38,7 +38,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` unset GIT_DESCRIBE_TAG # ucx-py version -export UCX_PY_VERSION='0.30.*' +export UCX_PY_VERSION='0.31.*' # Whether to install dask nightly or stable packages. export INSTALL_DASK_MAIN=1 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87b7075935..f194b152a6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -18,7 +18,7 @@ dependencies: - cupy - cxx-compiler - cython>=0.29,<0.30 -- dask-cuda=23.02 +- dask-cuda=23.04 - dask>=2022.12.0 - distributed>=2022.12.0 - doxygen>=1.8.20 @@ -36,13 +36,13 @@ dependencies: - ninja - pytest - pytest-cov -- rmm=23.02 +- rmm=23.04 - scikit-build>=0.13.1 - scikit-learn - scipy - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py=0.30 +- ucx-py=0.31.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 784bbbb935..4a3abd4bd7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. -set(RAPIDS_VERSION "23.02") -set(RAFT_VERSION "23.02.00") +set(RAPIDS_VERSION "23.04") +set(RAFT_VERSION "23.04.00") cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) diff --git a/dependencies.yaml b/dependencies.yaml index ae900542c0..c4fad0e92d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -166,15 +166,15 @@ dependencies: common: - output_types: [conda] packages: - - rmm=23.02 + - rmm=23.04 - dask>=2022.12.0 - distributed>=2022.12.0 - ucx>=1.13.0 - - ucx-py=0.30 + - ucx-py=0.31.* - ucx-proc=*=gpu - libfaiss>=1.7.1=cuda* - faiss-proc=*=cuda - - dask-cuda=23.02 + - dask-cuda=23.04 test_python: common: - output_types: [conda, requirements] diff --git a/docs/source/conf.py b/docs/source/conf.py index 4a0dfe00b5..a85dc15b3b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -85,9 +85,9 @@ # built documents. # # The short X.Y version. -version = '23.02' +version = '23.04' # The full version, including alpha/beta/rc tags. -release = '23.02.00' +release = '23.04.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 248f5784c0..a6be017d77 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.04/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake ) endif() diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 3efc3a547b..98d723e27b 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -set(pylibraft_version 23.02.00) +set(pylibraft_version 23.04.00) include(../../fetch_rapids.cmake) diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 742cd522c3..8486523226 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -set(raft_dask_version 23.02.00) +set(raft_dask_version 23.04.00) include(../../fetch_rapids.cmake) From 5a6cb097fcdb9e781a21d3adddcf6d4443ce6650 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Mon, 23 Jan 2023 16:56:50 +0100 Subject: [PATCH 29/44] ANN tests: make the min_recall check strict (#1156) In #1135, we adjusted the min_recall values to report if any regressions happen in ivf-pq. However, `eval_neighbours` function, which is used in several ANN test suites, doesn't fail unless the regression is really large (it prints a warning if the calculated recall is "slightly" smaller than the expected recall). In this PR, I make `eval_neighbours` always fail if the calculated recall is smaller than the expected recall. Slightly adjust the tests and do a small refactoring along the way. Authors: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1156 --- cpp/test/neighbors/ann_ivf_pq.cuh | 13 +++++--- cpp/test/neighbors/ann_utils.cuh | 52 +++++++++++++++++-------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index b5671b74b0..719f429f13 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -139,8 +139,8 @@ class ivf_pq_test : public ::testing::TestWithParam { protected: void gen_data() { - database.resize(ps.num_db_vecs * ps.dim, stream_); - search_queries.resize(ps.num_queries * ps.dim, stream_); + database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_); + search_queries.resize(size_t{ps.num_queries} * size_t{ps.dim}, stream_); raft::random::Rng r(1234ULL); if constexpr (std::is_same{}) { @@ -155,7 +155,7 @@ class ivf_pq_test : public ::testing::TestWithParam { void calc_ref() { - size_t queries_size = ps.num_queries * ps.k; + size_t queries_size = size_t{ps.num_queries} * size_t{ps.k}; rmm::device_uvector distances_naive_dev(queries_size, stream_); rmm::device_uvector indices_naive_dev(queries_size, stream_); naiveBfKnn(distances_naive_dev.data(), @@ -463,7 +463,7 @@ inline auto enum_variety() -> test_cases_t }); ADD_CASE({ x.search_params.lut_dtype = CUDA_R_8U; - x.min_recall = 0.85; + x.min_recall = 0.84; }); ADD_CASE({ @@ -496,7 +496,10 @@ inline auto enum_variety_ip() -> test_cases_t // InnerProduct score is signed, // thus we're forced to used signed 8-bit representation, // thus we have one bit less precision - y.min_recall = y.min_recall.value() * 0.95; + y.min_recall = y.min_recall.value() * 0.90; + } else { + // In other cases it seems to perform a little bit better, still worse than L2 + y.min_recall = y.min_recall.value() * 0.94; } } y.index_params.metric = distance::DistanceType::InnerProduct; diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index bb2f334db4..551ebd767f 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -110,28 +110,39 @@ __global__ void naive_distance_kernel(EvalT* dist, IdxT m, IdxT n, IdxT k, - raft::distance::DistanceType type) + raft::distance::DistanceType metric) { - IdxT midx = threadIdx.x + blockIdx.x * blockDim.x; + IdxT midx = IdxT(threadIdx.x) + IdxT(blockIdx.x) * IdxT(blockDim.x); if (midx >= m) return; - for (IdxT nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n; - nidx += blockDim.y * gridDim.y) { + IdxT grid_size = IdxT(blockDim.y) * IdxT(gridDim.y); + for (IdxT nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n; nidx += grid_size) { EvalT acc = EvalT(0); for (IdxT i = 0; i < k; ++i) { IdxT xidx = i + midx * k; IdxT yidx = i + nidx * k; - EvalT xv = (EvalT)x[xidx]; - EvalT yv = (EvalT)y[yidx]; - if (type == raft::distance::DistanceType::InnerProduct) { - acc += xv * yv; - } else { - EvalT diff = xv - yv; - acc += diff * diff; + auto xv = EvalT(x[xidx]); + auto yv = EvalT(y[yidx]); + switch (metric) { + case raft::distance::DistanceType::InnerProduct: { + acc += xv * yv; + } break; + case raft::distance::DistanceType::L2SqrtExpanded: + case raft::distance::DistanceType::L2SqrtUnexpanded: + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2Unexpanded: { + auto diff = xv - yv; + acc += diff * diff; + } break; + default: break; } } - if (type == raft::distance::DistanceType::L2SqrtExpanded || - type == raft::distance::DistanceType::L2SqrtUnexpanded) - acc = raft::sqrt(acc); + switch (metric) { + case raft::distance::DistanceType::L2SqrtExpanded: + case raft::distance::DistanceType::L2SqrtUnexpanded: { + acc = raft::sqrt(acc); + } break; + default: break; + } dist[midx * n + nidx] = acc; } } @@ -241,16 +252,9 @@ auto eval_neighbours(const std::vector& expected_idx, error_margin < 0 ? "above" : "below", eps); if (actual_recall < min_recall - eps) { - if (actual_recall < min_recall * min_recall - eps) { - RAFT_LOG_ERROR("Recall is much lower than the minimum (%f < %f)", actual_recall, min_recall); - } else { - RAFT_LOG_WARN("Recall is suspiciously too low (%f < %f)", actual_recall, min_recall); - } - if (match_count == 0 || actual_recall < min_recall * std::min(min_recall, 0.5) - eps) { - return testing::AssertionFailure() - << "actual recall (" << actual_recall - << ") is much smaller than the minimum expected recall (" << min_recall << ")."; - } + return testing::AssertionFailure() + << "actual recall (" << actual_recall << ") is lower than the minimum expected recall (" + << min_recall << "); eps = " << eps << ". "; } return testing::AssertionSuccess(); } From 0076101e69c03bab03c9cb022d2e4c519bce60af Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Mon, 23 Jan 2023 21:17:02 +0100 Subject: [PATCH 30/44] matrix::select_k: move selection and warp-sort primitives (#1085) Refactor and move a set of implementations for batch-selecting top K largest/smallest values: - Move device warp-wide primitives `bitonic_sort.cuh` to the public `raft::util` namespace, add tests. - Create a new public `matrix::select_k` interface. - Deprecate the legacy public `raft::spatial::knn::select_k` interface. - Copy/adapt `select_k` tests. - Move/adapt `select_k` benchmarks. - Rework the internals of `select_warpsort.cuh` to enable more implementations. Closes https://github.com/rapidsai/raft/issues/853 Authors: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1085 --- cpp/bench/CMakeLists.txt | 6 +- cpp/bench/matrix/select_k.cu | 133 +++++ cpp/bench/neighbors/selection.cu | 123 ----- .../topk.cuh => matrix/detail/select_k.cuh} | 58 +-- .../detail/select_radix.cuh} | 113 +++-- .../detail/select_warpsort.cuh} | 415 +++++++++++----- cpp/include/raft/matrix/select_k.cuh | 110 +++++ cpp/include/raft/neighbors/detail/refine.cuh | 4 +- .../spatial/knn/detail/ivf_flat_search.cuh | 75 +-- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 79 ++- cpp/include/raft/spatial/knn/knn.cuh | 38 +- .../knn/detail/topk => util}/bitonic_sort.cuh | 83 ++-- cpp/include/raft/util/integer_utils.hpp | 34 +- cpp/test/CMakeLists.txt | 5 +- cpp/test/matrix/select_k.cu | 459 ++++++++++++++++++ cpp/test/matrix/select_k.cuh | 127 +++++ cpp/test/neighbors/ann_ivf_flat.cu | 8 +- cpp/test/neighbors/ann_utils.cuh | 23 +- cpp/test/neighbors/selection.cu | 2 +- cpp/test/util/bitonic_sort.cu | 200 ++++++++ docs/source/cpp_api/matrix_ordering.rst | 12 + 21 files changed, 1631 insertions(+), 476 deletions(-) create mode 100644 cpp/bench/matrix/select_k.cu delete mode 100644 cpp/bench/neighbors/selection.cu rename cpp/include/raft/{spatial/knn/detail/topk.cuh => matrix/detail/select_k.cuh} (59%) rename cpp/include/raft/{spatial/knn/detail/topk/radix_topk.cuh => matrix/detail/select_radix.cuh} (87%) rename cpp/include/raft/{spatial/knn/detail/topk/warpsort_topk.cuh => matrix/detail/select_warpsort.cuh} (71%) create mode 100644 cpp/include/raft/matrix/select_k.cuh rename cpp/include/raft/{spatial/knn/detail/topk => util}/bitonic_sort.cuh (68%) create mode 100644 cpp/test/matrix/select_k.cu create mode 100644 cpp/test/matrix/select_k.cuh create mode 100644 cpp/test/util/bitonic_sort.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 8dcdb325e9..6b985acfc3 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -103,7 +103,10 @@ if(BUILD_BENCH) bench/main.cpp ) - ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/main.cpp) + ConfigureBench( + NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu + bench/main.cpp + ) ConfigureBench( NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu @@ -127,7 +130,6 @@ if(BUILD_BENCH) bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu bench/neighbors/refine.cu - bench/neighbors/selection.cu bench/main.cpp OPTIONAL DIST diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/matrix/select_k.cu new file mode 100644 index 0000000000..452a50ba50 --- /dev/null +++ b/cpp/bench/matrix/select_k.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TODO: reconsider how to organize shared test+bench files better + * Related Issue: https://github.com/rapidsai/raft/issues/1153 + * (although this header does not depend on any gtest headers) + */ +#include "../../test/matrix/select_k.cuh" + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace raft::matrix { + +using namespace raft::bench; // NOLINT + +template +struct selection : public fixture { + explicit selection(const select::params& p) + : params_(p), + in_dists_(p.batch_size * p.len, stream), + in_ids_(p.batch_size * p.len, stream), + out_dists_(p.batch_size * p.k, stream), + out_ids_(p.batch_size * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); + raft::random::RngState state{42}; + raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); + } + + void run_benchmark(::benchmark::State& state) override // NOLINT + { + handle_t handle{stream}; + using_pool_memory_res res; + try { + std::ostringstream label_stream; + label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; + state.SetLabel(label_stream.str()); + loop_on_state(state, [this, &handle]() { + select::select_k_impl(handle, + Algo, + in_dists_.data(), + in_ids_.data(), + params_.batch_size, + params_.len, + params_.k, + out_dists_.data(), + out_ids_.data(), + params_.select_min); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const select::params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, + {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, + {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, + + {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, + {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, + {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, + + {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, + {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, + {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, + + {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, + {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, + {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ + } + +SELECTION_REGISTER(float, int, kPublicApi); // NOLINT +SELECTION_REGISTER(float, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, int, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, int, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, int, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributedShm); // NOLINT + +SELECTION_REGISTER(double, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int, kWarpAuto); // NOLINT + +SELECTION_REGISTER(double, size_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, size_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributedShm); // NOLINT + +} // namespace raft::matrix diff --git a/cpp/bench/neighbors/selection.cu b/cpp/bench/neighbors/selection.cu deleted file mode 100644 index 1f116c199f..0000000000 --- a/cpp/bench/neighbors/selection.cu +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#if defined RAFT_NN_COMPILED -#include -#endif - -#include -#include - -#include -#include - -namespace raft::bench::spatial { - -struct params { - int n_inputs; - int input_len; - int k; - int select_min; -}; - -template -struct selection : public fixture { - explicit selection(const params& p) - : params_(p), - in_dists_(p.n_inputs * p.input_len, stream), - in_ids_(p.n_inputs * p.input_len, stream), - out_dists_(p.n_inputs * p.k, stream), - out_ids_(p.n_inputs * p.k, stream) - { - raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream); - raft::random::RngState state{42}; - raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); - } - - void run_benchmark(::benchmark::State& state) override - { - using_pool_memory_res res; - try { - std::ostringstream label_stream; - label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k; - state.SetLabel(label_stream.str()); - loop_on_state(state, [this]() { - raft::spatial::knn::select_k(in_dists_.data(), - in_ids_.data(), - params_.n_inputs, - params_.input_len, - out_dists_.data(), - out_ids_.data(), - params_.select_min, - params_.k, - stream, - Algo); - }); - } catch (raft::exception& e) { - state.SkipWithError(e.what()); - } - } - - private: - const params params_; - rmm::device_uvector in_dists_, out_dists_; - rmm::device_uvector in_ids_, out_ids_; -}; - -const std::vector kInputs{ - {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, - {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, - {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, - - {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, - {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, - {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, - - {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, - {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, - {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, - - {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, - {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, - {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, -}; - -#define SELECTION_REGISTER(KeyT, IdxT, Algo) \ - namespace BENCHMARK_PRIVATE_NAME(selection) \ - { \ - using SelectK = selection; \ - RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \ - } - -SELECTION_REGISTER(float, int, FAISS); -SELECTION_REGISTER(float, int, RADIX_8_BITS); -SELECTION_REGISTER(float, int, RADIX_11_BITS); -SELECTION_REGISTER(float, int, WARP_SORT); - -SELECTION_REGISTER(double, int, FAISS); -SELECTION_REGISTER(double, int, RADIX_8_BITS); -SELECTION_REGISTER(double, int, RADIX_11_BITS); -SELECTION_REGISTER(double, int, WARP_SORT); - -SELECTION_REGISTER(double, size_t, FAISS); -SELECTION_REGISTER(double, size_t, RADIX_8_BITS); -SELECTION_REGISTER(double, size_t, RADIX_11_BITS); -SELECTION_REGISTER(double, size_t, WARP_SORT); - -} // namespace raft::bench::spatial diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/matrix/detail/select_k.cuh similarity index 59% rename from cpp/include/raft/spatial/knn/detail/topk.cuh rename to cpp/include/raft/matrix/detail/select_k.cuh index f4dcb53088..ac1ba3dfa3 100644 --- a/cpp/include/raft/spatial/knn/detail/topk.cuh +++ b/cpp/include/raft/matrix/detail/select_k.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,34 +16,34 @@ #pragma once -#include "topk/radix_topk.cuh" -#include "topk/warpsort_topk.cuh" +#include "select_radix.cuh" +#include "select_warpsort.cuh" #include #include #include -namespace raft::spatial::knn::detail { +namespace raft::matrix::detail { /** * Select k smallest or largest key/values from each row in the input data. * - * If you think of the input data `in_keys` as a row-major matrix with len columns and - * batch_size rows, then this function selects k smallest/largest values in each row and fills - * in the row-major matrix `out` of size (batch_size, k). + * If you think of the input data `in_val` as a row-major matrix with `len` columns and + * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills + * in the row-major matrix `out_val` of size (batch_size, k). * * @tparam T * the type of the keys (what is being compared). * @tparam IdxT * the index type (what is being selected together with the keys). * - * @param[in] in + * @param[in] in_val * contiguous device array of inputs of size (len * batch_size); * these are compared and selected. * @param[in] in_idx * contiguous device array of inputs of size (len * batch_size); - * typically, these are indices of the corresponding in_keys. + * typically, these are indices of the corresponding in_val. * @param batch_size * number of input rows, i.e. the batch size. * @param len @@ -51,12 +51,12 @@ namespace raft::spatial::knn::detail { * Invariant: len >= k. * @param k * the number of outputs to select in each input row. - * @param[out] out + * @param[out] out_val * contiguous device array of outputs of size (k * batch_size); - * the k smallest/largest values from each row of the `in_keys`. + * the k smallest/largest values from each row of the `in_val`. * @param[out] out_idx * contiguous device array of outputs of size (k * batch_size); - * the payload selected together with `out`. + * the payload selected together with `out_val`. * @param select_min * whether to select k smallest (true) or largest (false) keys. * @param stream @@ -64,28 +64,28 @@ namespace raft::spatial::knn::detail { * memory pool here to avoid memory allocations within the call). */ template -void select_topk(const T* in, - const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, - T* out, - IdxT* out_idx, - bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) +void select_k(const T* in_val, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out_val, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { common::nvtx::range fun_scope( - "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); // TODO (achirkin): investigate the trade-off for a wider variety of inputs. const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128; - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) { - topk::warp_sort_topk( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); + if (k <= select::warpsort::kMaxCapacity && !radix_faster) { + select::warpsort::select_k( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } else { - topk::radix_topk= 4 ? 11 : 8), 512>( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); + select::radix::select_k= 4 ? 11 : 8), 512>( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } } -} // namespace raft::spatial::knn::detail +} // namespace raft::matrix::detail diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh similarity index 87% rename from cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh rename to cpp/include/raft/matrix/detail/select_radix.cuh index 9c0f20b706..de19e63a4c 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -27,29 +28,29 @@ #include #include -#include +#include #include -namespace raft::spatial::knn::detail::topk { +namespace raft::matrix::detail::select::radix { constexpr int ITEM_PER_THREAD = 32; constexpr int VECTORIZED_READ_SIZE = 16; template -__host__ __device__ constexpr int calc_num_buckets() +_RAFT_HOST_DEVICE constexpr int calc_num_buckets() { return 1 << BitsPerPass; } template -__host__ __device__ constexpr int calc_num_passes() +_RAFT_HOST_DEVICE constexpr int calc_num_passes() { return ceildiv(sizeof(T) * 8, BitsPerPass); } // Minimum reasonable block size for the given radix size. template -__host__ __device__ constexpr int calc_min_block_size() +_RAFT_HOST_DEVICE constexpr int calc_min_block_size() { return 1 << std::max(BitsPerPass - 4, Pow2::Log2 + 1); } @@ -62,7 +63,7 @@ __host__ __device__ constexpr int calc_min_block_size() * NB: Use pass=-1 for calc_mask(). */ template -__device__ constexpr int calc_start_bit(int pass) +_RAFT_DEVICE constexpr int calc_start_bit(int pass) { int start_bit = static_cast(sizeof(T) * 8) - (pass + 1) * BitsPerPass; if (start_bit < 0) { start_bit = 0; } @@ -70,7 +71,7 @@ __device__ constexpr int calc_start_bit(int pass) } template -__device__ constexpr unsigned calc_mask(int pass) +_RAFT_DEVICE constexpr unsigned calc_mask(int pass) { static_assert(BitsPerPass <= 31); int num_bits = calc_start_bit(pass - 1) - calc_start_bit(pass); @@ -82,7 +83,7 @@ __device__ constexpr unsigned calc_mask(int pass) * as of integers. */ template -__device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) +_RAFT_DEVICE typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) { auto bits = reinterpret_cast::UnsignedBits&>(key); bits = cub::Traits::TwiddleIn(bits); @@ -91,7 +92,7 @@ __device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) } template -__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) +_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater) { static_assert(BitsPerPass <= sizeof(int) * 8 - 1); // so return type can be int return (twiddle_in(x, greater) >> start_bit) & mask; @@ -112,7 +113,7 @@ __device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) * @param f the lambda taking two arguments (T x, IdxT idx) */ template -__device__ void vectorized_process(const T* in, IdxT len, Func f) +_RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f) { const IdxT stride = blockDim.x * gridDim.x; const int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -167,18 +168,18 @@ struct Counter { * (see steps 4-1 in `radix_kernel` description). */ template -__device__ void filter_and_histogram(const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - IdxT len, - Counter* counter, - IdxT* histogram, - bool greater, - int pass, - int k) +_RAFT_DEVICE void filter_and_histogram(const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + Counter* counter, + IdxT* histogram, + bool greater, + int pass, + int k) { constexpr int num_buckets = calc_num_buckets(); __shared__ IdxT histogram_smem[num_buckets]; @@ -260,10 +261,10 @@ __device__ void filter_and_histogram(const T* in_buf, * (step 2 in `radix_kernel` description) */ template -__device__ void scan(volatile IdxT* histogram, - const int start, - const int num_buckets, - const IdxT current) +_RAFT_DEVICE void scan(volatile IdxT* histogram, + const int start, + const int num_buckets, + const IdxT current) { typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; @@ -284,7 +285,7 @@ __device__ void scan(volatile IdxT* histogram, * (steps 2-3 in `radix_kernel` description) */ template -__device__ void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) +_RAFT_DEVICE void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) { constexpr int num_buckets = calc_num_buckets(); int index = threadIdx.x; @@ -547,21 +548,21 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len) * memory pool here to avoid memory allocations within the call). */ template -void radix_topk(const T* in, - const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, - T* out, - IdxT* out_idx, - bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) +void select_k(const T* in, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { // reduce the block size if the input length is too small. if constexpr (BlockSize > calc_min_block_size()) { if (BlockSize * ITEM_PER_THREAD > len) { - return radix_topk( + return select_k( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); } } @@ -573,23 +574,33 @@ void radix_topk(const T* in, dim3 blocks = get_optimal_grid_size(batch_size, len); size_t max_chunk_size = blocks.y; - auto pool_guard = raft::get_pool_memory_resource( - mr, - max_chunk_size * (sizeof(Counter) // counters - + sizeof(IdxT) * (num_buckets + 2) // histograms and IdxT bufs - + sizeof(T) * 2 // T bufs - )); + size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT)); + size_t req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)); + size_t mem_req = req_aux + req_buf; + size_t mem_free, mem_total; + RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total)); + std::optional managed_memory; + rmm::mr::device_memory_resource* mr_buf = nullptr; + if (mem_req > mem_free) { + // if there's not enough memory for buffers on the device, resort to the managed memory. + mem_req = req_aux; + managed_memory.emplace(); + mr_buf = &managed_memory.value(); + } + + auto pool_guard = raft::get_pool_memory_resource(mr, mem_req); if (pool_guard) { - RAFT_LOG_DEBUG("radix_topk: using pool memory resource with initial size %zu bytes", + RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } + if (mr_buf == nullptr) { mr_buf = mr; } rmm::device_uvector> counters(max_chunk_size, stream, mr); - rmm::device_uvector histograms(num_buckets * max_chunk_size, stream, mr); - rmm::device_uvector buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector buf2(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf2(len * max_chunk_size, stream, mr); + rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); + rmm::device_uvector buf1(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector idx_buf1(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector buf2(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector idx_buf2(max_chunk_size * len, stream, mr_buf); for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) { blocks.y = std::min(max_chunk_size, batch_size - offset); @@ -646,4 +657,4 @@ void radix_topk(const T* in, } } -} // namespace raft::spatial::knn::detail::topk +} // namespace raft::matrix::detail::select::radix diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh similarity index 71% rename from cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh rename to cpp/include/raft/matrix/detail/select_warpsort.cuh index c06aa04aea..d362b73792 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -16,10 +16,11 @@ #pragma once -#include "bitonic_sort.cuh" - +#include #include +#include #include +#include #include #include @@ -31,12 +32,12 @@ /* Three APIs of different scopes are provided: - 1. host function: warp_sort_topk() + 1. host function: select_k() 2. block-wide API: class block_sort 3. warp-wide API: several implementations of warp_sort_* - 1. warp_sort_topk() + 1. select_k() (see the docstring) 2. class block_sort @@ -74,7 +75,7 @@ These two classes can be regarded as fixed size priority queue for a warp. Usage is similar to class block_sort. No shared memory is needed. - The host function (warp_sort_topk) uses a heuristic to choose between these two classes for + The host function (select_k) uses a heuristic to choose between these two classes for sorting, warp_sort_immediate being chosen when the number of inputs per warp is somewhat small (see the usage of LaunchThreshold::len_factor_for_choosing). @@ -94,7 +95,7 @@ } */ -namespace raft::spatial::knn::detail::topk { +namespace raft::matrix::detail::select::warpsort { static constexpr int kMaxCapacity = 256; @@ -102,18 +103,12 @@ namespace { /** Whether 'left` should indeed be on the left w.r.t. `right`. */ template -__device__ __forceinline__ auto is_ordered(T left, T right) -> bool +_RAFT_DEVICE _RAFT_FORCEINLINE auto is_ordered(T left, T right) -> bool { if constexpr (Ascending) { return left < right; } if constexpr (!Ascending) { return left > right; } } -constexpr auto calc_capacity(int k) -> int -{ - int capacity = isPo2(k) ? k : (1 << (log2(k) + 1)); - return capacity; -} - } // namespace /** @@ -134,7 +129,7 @@ constexpr auto calc_capacity(int k) -> int */ template class warp_sort { - static_assert(isPo2(Capacity)); + static_assert(is_a_power_of_two(Capacity)); static_assert(std::is_default_constructible_v); public: @@ -148,13 +143,16 @@ class warp_sort { /** The number of elements to select. */ const int k; + /** Extra memory required per-block for keeping the state (shared or global). */ + constexpr static auto mem_required(uint32_t block_size) -> size_t { return 0; } + /** * Construct the warp_sort empty queue. * * @param k * number of elements to select. */ - __device__ warp_sort(int k) : k(k) + _RAFT_DEVICE warp_sort(int k) : k(k) { #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -182,7 +180,7 @@ class warp_sort { * It serves as a conditional; when `false` the function does nothing. * We need it to ensure threads within a full warp don't diverge calling `bitonic::merge()`. */ - __device__ void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true) + _RAFT_DEVICE void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true) { if (do_merge) { int idx = Pow2::mod(laneId()) ^ Pow2::Mask; @@ -198,7 +196,7 @@ class warp_sort { } } if (kWarpWidth < WarpSize || do_merge) { - topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); + util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); } } @@ -211,15 +209,23 @@ class warp_sort { * @param[out] out_idx * device pointer to a contiguous array, unique per-subwarp of size `kWarpWidth` * (length: k <= kWarpWidth * kMaxArrLen). + * @param valF (optional) postprocess values (T -> OutT) + * @param idxF (optional) postprocess indices (IdxT -> OutIdxT) */ - template - __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const + template + _RAFT_DEVICE void store(OutT* out, + OutIdxT* out_idx, + ValF valF = raft::identity_op{}, + IdxF idxF = raft::identity_op{}) const { int idx = Pow2::mod(laneId()); #pragma unroll kMaxArrLen for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) { - out[idx] = post_process(val_arr_[i]); - out_idx[idx] = idx_arr_[i]; + out[idx] = valF(val_arr_[i]); + out_idx[idx] = idxF(idx_arr_[i]); } } @@ -246,8 +252,8 @@ class warp_sort { * the associated indices of the elements in the same format as `keys_in`. */ template - __device__ __forceinline__ void merge_in(const T* __restrict__ keys_in, - const IdxT* __restrict__ ids_in) + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_in(const T* __restrict__ keys_in, + const IdxT* __restrict__ ids_in) { #pragma unroll for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) { @@ -258,7 +264,7 @@ class warp_sort { idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i]; } } - topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); + util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); } }; @@ -276,8 +282,9 @@ class warp_sort_filtered : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_filtered(int k, T limit) + explicit _RAFT_DEVICE warp_sort_filtered(int k, T limit = kDummy) : warp_sort(k), buf_len_(0), k_th_(limit) { #pragma unroll @@ -287,12 +294,14 @@ class warp_sort_filtered : public warp_sort { } } - __device__ __forceinline__ explicit warp_sort_filtered(int k) - : warp_sort_filtered(k, kDummy) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, + uint8_t* = nullptr, + T limit = kDummy) { + return warp_sort_filtered{k, limit}; } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE void add(T val, IdxT idx) { // comparing for k_th should reduce the total amount of updates: // `false` means the input value is surely not in the top-k values. @@ -310,22 +319,22 @@ class warp_sort_filtered : public warp_sort { if (do_add) { add_to_buf_(val, idx); } } - __device__ void done() + _RAFT_DEVICE void done() { if (any(buf_len_ != 0)) { merge_buf_(); } } private: - __device__ __forceinline__ void set_k_th_() + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() { // NB on using srcLane: it's ok if it is outside the warp size / width; // the modulo op will be done inside the __shfl_sync. k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); } - __device__ __forceinline__ void merge_buf_() + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); buf_len_ = 0; set_k_th_(); // contains warp sync @@ -335,7 +344,7 @@ class warp_sort_filtered : public warp_sort { } } - __device__ __forceinline__ void add_to_buf_(T val, IdxT idx) + _RAFT_DEVICE _RAFT_FORCEINLINE void add_to_buf_(T val, IdxT idx) { // NB: the loop is used here to ensure the constant indexing, // to not force the buffers spill into the local memory. @@ -374,8 +383,9 @@ class warp_sort_distributed : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_distributed(int k, T limit) + explicit _RAFT_DEVICE warp_sort_distributed(int k, T limit = kDummy) : warp_sort(k), buf_val_(kDummy), buf_idx_(IdxT{}), @@ -384,12 +394,14 @@ class warp_sort_distributed : public warp_sort { { } - __device__ __forceinline__ explicit warp_sort_distributed(int k) - : warp_sort_distributed(k, kDummy) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, + uint8_t* = nullptr, + T limit = kDummy) { + return warp_sort_distributed{k, limit}; } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE void add(T val, IdxT idx) { // mask tells which lanes in the warp have valid items to be added uint32_t mask = ballot(is_ordered(val, k_th_)); @@ -429,7 +441,7 @@ class warp_sort_distributed : public warp_sort { } } - __device__ void done() + _RAFT_DEVICE void done() { if (buf_len_ != 0) { merge_buf_(); @@ -438,16 +450,16 @@ class warp_sort_distributed : public warp_sort { } private: - __device__ __forceinline__ void set_k_th_() + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() { // NB on using srcLane: it's ok if it is outside the warp size / width; // the modulo op will be done inside the __shfl_sync. k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); } - __device__ __forceinline__ void merge_buf_() + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() { - topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); + util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); this->merge_in<1>(&buf_val_, &buf_idx_); set_k_th_(); // contains warp sync buf_val_ = kDummy; @@ -464,6 +476,117 @@ class warp_sort_distributed : public warp_sort { T k_th_; }; +/** + * The same as `warp_sort_distributed`, but keeps the temporary value and index buffers + * in the given external pointers (normally, a shared memory pointer should be passed in). + */ +template +class warp_sort_distributed_ext : public warp_sort { + public: + using warp_sort::kDummy; + using warp_sort::kWarpWidth; + using warp_sort::k; + + constexpr static auto mem_required(uint32_t block_size) -> size_t + { + return (sizeof(T) + sizeof(IdxT)) * block_size; + } + + _RAFT_DEVICE warp_sort_distributed_ext(int k, T* val_buf, IdxT* idx_buf, T limit = kDummy) + : warp_sort(k), + val_buf_(val_buf), + idx_buf_(idx_buf), + buf_len_(0), + k_th_(limit) + { + val_buf_[laneId()] = kDummy; + } + + _RAFT_DEVICE static auto init_blockwide(int k, uint8_t* shmem, T limit = kDummy) + { + T* val_buf = nullptr; + IdxT* idx_buf = nullptr; + if constexpr (alignof(T) >= alignof(IdxT)) { + val_buf = reinterpret_cast(shmem); + idx_buf = reinterpret_cast(val_buf + blockDim.x); + } else { + idx_buf = reinterpret_cast(shmem); + val_buf = reinterpret_cast(idx_buf + blockDim.x); + } + auto warp_offset = Pow2::roundDown(threadIdx.x); + val_buf += warp_offset; + idx_buf += warp_offset; + return warp_sort_distributed_ext{k, val_buf, idx_buf, limit}; + } + + _RAFT_DEVICE void add(T val, IdxT idx) + { + bool do_add = is_ordered(val, k_th_); + // mask tells which lanes in the warp have valid items to be added + uint32_t mask = ballot(do_add); + if (mask == 0) { return; } + // where to put the element in the tmp buffer + int dst_ix = buf_len_ + __popc(mask & ((1u << laneId()) - 1u)); + // put all elements, which fit into the current tmp buffer + if (do_add && dst_ix < WarpSize) { + val_buf_[dst_ix] = val; + idx_buf_[dst_ix] = idx; + do_add = false; + } + // Total number of elements to be added + buf_len_ += __popc(mask); + // If the buffer is still not full, we can return + if (buf_len_ < WarpSize) { return; } + // Otherwise, merge the warp tmp buffer into the queue + merge_buf_(); // implies warp sync + buf_len_ -= WarpSize; + // save the inputs that couldn't fit before the merge + if (do_add) { + dst_ix -= WarpSize; + val_buf_[dst_ix] = val; + idx_buf_[dst_ix] = idx; + } + } + + _RAFT_DEVICE void done() + { + if (buf_len_ != 0) { + merge_buf_(); + buf_len_ = 0; + } + __syncthreads(); + } + + private: + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() + { + // NB on using srcLane: it's ok if it is outside the warp size / width; + // the modulo op will be done inside the __shfl_sync. + k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); + } + + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() + { + __syncwarp(); // make sure the threads are aware of the data written by others + T buf_val = val_buf_[laneId()]; + IdxT buf_idx = idx_buf_[laneId()]; + val_buf_[laneId()] = kDummy; + util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val, buf_idx); + this->merge_in<1>(&buf_val, &buf_idx); + set_k_th_(); // contains warp sync + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + + T* val_buf_; + IdxT* idx_buf_; + uint32_t buf_len_; // 0 <= buf_len_ < WarpSize + + T k_th_; +}; + /** * This version of warp_sort adds every input element into the intermediate sorting * buffer, and thus does the sorting step every `Capacity` input elements. @@ -476,8 +599,10 @@ class warp_sort_immediate : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_immediate(int k) : warp_sort(k), buf_len_(0) + explicit _RAFT_DEVICE warp_sort_immediate(int k) + : warp_sort(k), buf_len_(0) { #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -486,7 +611,12 @@ class warp_sort_immediate : public warp_sort { } } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, uint8_t* = nullptr) + { + return warp_sort_immediate{k}; + } + + _RAFT_DEVICE void add(T val, IdxT idx) { // NB: the loop is used here to ensure the constant indexing, // to not force the buffers spill into the local memory. @@ -500,7 +630,7 @@ class warp_sort_immediate : public warp_sort { ++buf_len_; if (buf_len_ == kMaxArrLen) { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -510,10 +640,10 @@ class warp_sort_immediate : public warp_sort { } } - __device__ void done() + _RAFT_DEVICE void done() { if (buf_len_ != 0) { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); } } @@ -545,15 +675,11 @@ class block_sort { using queue_t = WarpSortWarpWide; template - __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...) + _RAFT_DEVICE block_sort(int k, Args... args) : queue_(queue_t::init_blockwide(k, args...)) { - val_smem_ = reinterpret_cast(smem_buf); - const int num_of_warp = subwarp_align::div(blockDim.x); - idx_smem_ = reinterpret_cast( - smem_buf + Pow2<256>::roundUp(ceildiv(num_of_warp, 2) * sizeof(T) * k)); } - __device__ void add(T val, IdxT idx) { queue_.add(val, idx); } + _RAFT_DEVICE void add(T val, IdxT idx) { queue_.add(val, idx); } /** * At the point of calling this function, the warp-level queues consumed all input @@ -561,22 +687,26 @@ class block_sort { * * Here we tree-merge the results using the shared memory and block sync. */ - __device__ void done() + _RAFT_DEVICE void done(uint8_t* smem_buf) { queue_.done(); + int nwarps = subwarp_align::div(blockDim.x); + auto val_smem = reinterpret_cast(smem_buf); + auto idx_smem = reinterpret_cast( + smem_buf + Pow2<256>::roundUp(ceildiv(nwarps, 2) * sizeof(T) * queue_.k)); + const int warp_id = subwarp_align::div(threadIdx.x); // NB: there is no need for the second __synchthreads between .load_sorted and .store: // we shift the pointers every iteration, such that individual warps either access the same // locations or do not overlap with any of the other warps. The access patterns within warps // are different for the two functions, but .load_sorted implies warp sync at the end, so // there is no need for __syncwarp either. - for (int shift_mask = ~0, nwarps = subwarp_align::div(blockDim.x), split = (nwarps + 1) >> 1; - nwarps > 1; + for (int shift_mask = ~0, split = (nwarps + 1) >> 1; nwarps > 1; nwarps = split, split = (nwarps + 1) >> 1) { if (warp_id < nwarps && warp_id >= split) { int dst_warp_shift = (warp_id - (split & shift_mask)) * queue_.k; - queue_.store(val_smem_ + dst_warp_shift, idx_smem_ + dst_warp_shift); + queue_.store(val_smem + dst_warp_shift, idx_smem + dst_warp_shift); } __syncthreads(); @@ -586,23 +716,27 @@ class block_sort { // The last argument serves as a condition for loading // -- to make sure threads within a full warp do not diverge on `bitonic::merge()` queue_.load_sorted( - val_smem_ + src_warp_shift, idx_smem_ + src_warp_shift, warp_id < nwarps - split); + val_smem + src_warp_shift, idx_smem + src_warp_shift, warp_id < nwarps - split); } } } /** Save the content by the pointer location. */ - template - __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const + template + _RAFT_DEVICE void store(OutT* out, + OutIdxT* out_idx, + ValF valF = raft::identity_op{}, + IdxF idxF = raft::identity_op{}) const { - if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, post_process); } + if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, valF, idxF); } } private: using subwarp_align = Pow2; queue_t queue_; - T* val_smem_; - IdxT* idx_smem_; }; /** @@ -620,7 +754,10 @@ __launch_bounds__(256) __global__ void block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx) { extern __shared__ __align__(256) uint8_t smem_buf_bytes[]; - block_sort queue(k, smem_buf_bytes); + using bq_t = block_sort; + uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr; + bq_t queue(k, warp_smem); + in += blockIdx.y * len; if (in_idx != nullptr) { in_idx += blockIdx.y * len; } @@ -631,7 +768,7 @@ __launch_bounds__(256) __global__ (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i); } - queue.done(); + queue.done(smem_buf_bytes); const int block_id = blockIdx.x + gridDim.x * blockIdx.y; queue.store(out + block_id * k, out_idx + block_id * k); } @@ -658,7 +795,7 @@ struct launch_setup { int* min_grid_size, int block_size_limit = 0) { - const int capacity = calc_capacity(k); + const int capacity = bound_by_power_of_two(k); if constexpr (Capacity > 1) { if (capacity < Capacity) { return launch_setup::calc_optimal_params( @@ -691,7 +828,7 @@ struct launch_setup { IdxT* out_idx, rmm::cuda_stream_view stream) { - const int capacity = calc_capacity(k); + const int capacity = bound_by_power_of_two(k); if constexpr (Capacity > 1) { if (capacity < Capacity) { return launch_setup::kernel(k, @@ -742,6 +879,18 @@ struct LaunchThreshold { static constexpr int len_factor_for_single_block = 32; }; +template <> +struct LaunchThreshold { + static constexpr int len_factor_for_multi_block = 2; + static constexpr int len_factor_for_single_block = 32; +}; + +template <> +struct LaunchThreshold { + static constexpr int len_factor_for_multi_block = 2; + static constexpr int len_factor_for_single_block = 32; +}; + template <> struct LaunchThreshold { static constexpr int len_factor_for_choosing = 4; @@ -753,7 +902,7 @@ template