diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 7a0f1d5201..51e1c41499 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -30,6 +30,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
   bench/random/permute.cu
   bench/random/rng.cu
   bench/spatial/fused_l2_nn.cu
+  bench/spatial/knn.cu
   bench/spatial/selection.cu
   bench/main.cpp
 )
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index de34cf4f57..fb878a0c8d 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -40,7 +40,7 @@ struct using_pool_memory_res {
  private:
  rmm::mr::device_memory_resource* orig_res_;
  rmm::mr::cuda_memory_resource cuda_res_;
-  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_res_;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_res_;

  public:
  using_pool_memory_res(size_t initial_size, size_t max_size)
@@ -115,13 +115,20 @@ class fixture {
    int device_id = 0;
    RAFT_CUDA_TRY(cudaGetDevice(&device_id));
    RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id));
-    scratch_buf_ = rmm::device_buffer(l2_cache_size, stream);
+    scratch_buf_ = rmm::device_buffer(l2_cache_size * 3, stream);
  }

  // every benchmark should be overriding this
  virtual void run_benchmark(::benchmark::State& state) = 0;
  virtual void generate_metrics(::benchmark::State& state) {}

+ protected:
+  /** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache. */
+  void flush_L2_cache()
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
+  }
+
  /**
   * The helper to be used inside `run_benchmark`, to loop over the state and record time using the
   * cuda_event_timer.
@@ -130,9 +137,7 @@ class fixture {
  void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true)
  {
    for (auto _ : state) {
-      if (flush_L2) {
-        RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
-      }
+      if (flush_L2) { flush_L2_cache(); }
      cuda_event_timer timer(state, stream);
      benchmark_func();
    }
@@ -147,9 +152,9 @@
 class Fixture : public ::benchmark::Fixture {
  public:
  explicit Fixture(const std::string name, const Params&... params)
-    : ::benchmark::Fixture(), params_(params...)
+    : ::benchmark::Fixture(), params_(params...), name_(name)
  {
-    SetName(name.c_str());
+    SetName(name_.c_str());
  }

  Fixture() = delete;
@@ -165,6 +170,7 @@ class Fixture : public ::benchmark::Fixture {
  private:
  std::unique_ptr<Class> fixture_;
  std::tuple<Params...> params_;
+  const std::string name_;

  protected:
  void BenchmarkCase(State& state) override
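Note (illustration, not part of the patch): benchmarks derive from this fixture and
implement run_benchmark; loop_on_state now calls the new flush_L2_cache() helper
before each timed iteration. A minimal sketch of a derived benchmark, assuming the
fixture is default-constructible and lives in namespace raft::bench as in this header
(sample_bench is a hypothetical name):

    struct sample_bench : public raft::bench::fixture {
      void run_benchmark(::benchmark::State& state) override
      {
        // per iteration: untimed L2 flush, then the timed lambda
        loop_on_state(state, [this]() { /* launch the kernel under test */ });
      }
    };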
diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
new file mode 100644
index 0000000000..72a1244269
--- /dev/null
+++ b/cpp/bench/spatial/knn.cu
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/random/rng.cuh>
+#include <raft/spatial/knn/knn.cuh>
+#if defined RAFT_NN_COMPILED
+#include <raft/spatial/knn/specializations.cuh>
+#endif
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+
+#include <rmm/mr/host/new_delete_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+#include <optional>
+
+namespace raft::bench::spatial {
+
+struct params {
+  /** Size of the dataset. */
+  size_t n_samples;
+  /** Number of dimensions in the dataset. */
+  size_t n_dims;
+  /** The batch size -- number of KNN searches. */
+  size_t n_queries;
+  /** Number of nearest neighbours to find for every probe. */
+  size_t k;
+};
+
+auto operator<<(std::ostream& os, const params& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k;
+  return os;
+}
+
+enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED };
+enum class Scope { BUILD, SEARCH, BUILD_SEARCH };
+
+auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
+{
+  switch (ts) {
+    case TransferStrategy::NO_COPY: os << "NO_COPY"; break;
+    case TransferStrategy::COPY_PLAIN: os << "COPY_PLAIN"; break;
+    case TransferStrategy::COPY_PINNED: os << "COPY_PINNED"; break;
+    case TransferStrategy::MAP_PINNED: os << "MAP_PINNED"; break;
+    case TransferStrategy::MANAGED: os << "MANAGED"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+auto operator<<(std::ostream& os, const Scope& s) -> std::ostream&
+{
+  switch (s) {
+    case Scope::BUILD: os << "BUILD"; break;
+    case Scope::SEARCH: os << "SEARCH"; break;
+    case Scope::BUILD_SEARCH: os << "BUILD_SEARCH"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+struct device_resource {
+ public:
+  explicit device_resource(bool managed) : managed_(managed)
+  {
+    if (managed_) {
+      res_ = new rmm::mr::managed_memory_resource();
+    } else {
+      res_ = rmm::mr::get_current_device_resource();
+    }
+  }
+
+  ~device_resource()
+  {
+    if (managed_) { delete res_; }
+  }
+
+  [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; }
+
+ private:
+  const bool managed_;
+  rmm::mr::device_memory_resource* res_;
+};
+
+template <typename T>
+struct host_uvector {
+  host_uvector(size_t n, bool pinned) : n_(n)
+  {
+    if (pinned) {
+      res_ = new rmm::mr::pinned_memory_resource();
+    } else {
+      res_ = new rmm::mr::new_delete_resource();
+    }
+    arr_ = static_cast<T*>(res_->allocate(n_ * sizeof(T)));
+  }
+
+  ~host_uvector() noexcept
+  {
+    res_->deallocate(arr_, n_ * sizeof(T));
+    delete res_;
+  }
+
+  auto data() -> T* { return arr_; }
+  [[nodiscard]] auto size() const -> size_t { return n_; }
+
+ private:
+  rmm::mr::host_memory_resource* res_;
+  size_t n_;
+  T* arr_;
+};
+
+template <typename ValT, typename IdxT>
+struct brute_force_knn {
+  using dist_t = ValT;
+
+  ValT* index;
+  params ps;
+
+  brute_force_knn(const raft::handle_t& handle, const params& ps, const ValT* data)
+    : index(const_cast<ValT*>(data)), ps(ps)
+  {
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    std::vector<ValT*> input{index};
+    std::vector<size_t> sizes{ps.n_samples};
+    raft::spatial::knn::brute_force_knn(handle,
+                                        input,
+                                        sizes,
+                                        ps.n_dims,
+                                        const_cast<ValT*>(search_items),
+                                        ps.n_queries,
+                                        out_idxs,
+                                        out_dists,
+                                        ps.k);
+  }
+};
+
+template <typename ValT, typename IdxT, typename ImplT>
+struct knn : public fixture {
+  explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope)
+    : params_(p),
+      strategy_(strategy),
+      scope_(scope),
+      dev_mem_res_(strategy == TransferStrategy::MANAGED),
+      data_host_(0),
+      search_items_(p.n_queries * p.n_dims, stream),
+      out_dists_(p.n_queries * p.k, stream),
+      out_idxs_(p.n_queries * p.k, stream)
+  {
+    raft::random::RngState state{42};
+    gen_data(state, search_items_, search_items_.size(), stream);
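+    // The dataset is generated on the device in minibatches and copied to the
+    // host below, so inputs larger than the GPU memory can still be benchmarked.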
+    try {
+      size_t total_size = p.n_samples * p.n_dims;
+      data_host_.resize(total_size);
+      constexpr size_t kGenMinibatchSize = 1024 * 1024 * 1024;
+      rmm::device_uvector<ValT> d(std::min(kGenMinibatchSize, total_size), stream);
+      for (size_t offset = 0; offset < total_size; offset += kGenMinibatchSize) {
+        size_t actual_size = std::min(total_size - offset, kGenMinibatchSize);
+        gen_data(state, d, actual_size, stream);
+        copy(data_host_.data() + offset, d.data(), actual_size, stream);
+      }
+    } catch (std::bad_alloc& e) {
+      data_does_not_fit_ = true;
+    }
+  }
+
+  template <typename T>
+  void gen_data(raft::random::RngState& state,
+                rmm::device_uvector<T>& vec,
+                size_t n,
+                rmm::cuda_stream_view stream)
+  {
+    constexpr T kRangeMax = std::is_integral_v<T> ? std::numeric_limits<T>::max() : T(1);
+    constexpr T kRangeMin = std::is_integral_v<T> ? std::numeric_limits<T>::min() : T(-1);
+    if constexpr (std::is_integral_v<T>) {
+      raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    } else {
+      raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    }
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    if (data_does_not_fit_) {
+      state.SkipWithError("The data size is too big to fit into the host memory.");
+    }
+    if (scope_ == Scope::SEARCH && strategy_ != TransferStrategy::NO_COPY) {
+      state.SkipWithError(
+        "When benchmarking without index building (Scope::SEARCH), the data must be already on the "
+        "device (TransferStrategy::NO_COPY)");
+    }
+
+    using_pool_memory_res default_resource;
+
+    try {
+      std::ostringstream label_stream;
+      label_stream << params_ << "#" << strategy_ << "#" << scope_;
+      state.SetLabel(label_stream.str());
+      raft::handle_t handle(stream);
+      std::optional<ImplT> index;
+
+      if (scope_ == Scope::SEARCH) {  // also implies TransferStrategy::NO_COPY
+        rmm::device_uvector<ValT> data(data_host_.size(), stream);
+        copy(data.data(), data_host_.data(), data_host_.size(), stream);
+        index.emplace(handle, params_, data.data());
+        stream.synchronize();
+      }
+
+      // benchmark loop
+      for (auto _ : state) {
+        // managed or plain device memory initialized anew every time
+        rmm::device_uvector<ValT> data(data_host_.size(), stream, dev_mem_res_.get());
+        ValT* data_ptr         = data.data();
+        size_t allocation_size = data_host_.size() * sizeof(ValT);
+
+        // Non-benchmarked part: using different methods to copy the data if necessary
+        switch (strategy_) {
+          case TransferStrategy::NO_COPY:  // copy data to GPU before starting the timer.
+            copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            break;
+          case TransferStrategy::COPY_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterDefault));
+            break;
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterMapped));
+            RAFT_CUDA_TRY(cudaHostGetDevicePointer(&data_ptr, data_host_.data(), 0));
+            break;
+          case TransferStrategy::MANAGED:  // sic! using std::memcpy rather than cuda copy
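+            // Placement hints for the managed allocation: prefer GPU residency and
+            // mark the data read-mostly, while the host fills it via std::memcpy below.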
+            RAFT_CUDA_TRY(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, handle.get_device()));
+            RAFT_CUDA_TRY(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetAccessedBy, handle.get_device()));
+            RAFT_CUDA_TRY(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetReadMostly, handle.get_device()));
+            std::memcpy(data_ptr, data_host_.data(), allocation_size);
+            break;
+          default: break;
+        }
+
+        flush_L2_cache();
+        {
+          // Timer synchronizes the stream, so all prior gpu work should be done before it sets off.
+          cuda_event_timer timer(state, stream);
+          switch (strategy_) {
+            case TransferStrategy::COPY_PLAIN:
+            case TransferStrategy::COPY_PINNED:
+              copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            default: break;
+          }
+
+          if (scope_ != Scope::SEARCH) { index.emplace(handle, params_, data_ptr); }
+          if (scope_ != Scope::BUILD) {
+            index->search(handle, search_items_.data(), out_dists_.data(), out_idxs_.data());
+          }
+        }
+
+        if (scope_ != Scope::SEARCH) { index.reset(); }
+
+        switch (strategy_) {
+          case TransferStrategy::COPY_PINNED:
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(cudaHostUnregister(data_host_.data()));
+            break;
+          default: break;
+        }
+      }
+    } catch (raft::exception& e) {
+      state.SkipWithError(e.what());
+    } catch (std::bad_alloc& e) {
+      state.SkipWithError(e.what());
+    }
+  }
+
+ private:
+  const params params_;
+  const TransferStrategy strategy_;
+  const Scope scope_;
+  device_resource dev_mem_res_;
+  bool data_does_not_fit_ = false;
+
+  std::vector<ValT> data_host_;
+  rmm::device_uvector<ValT> search_items_;
+  rmm::device_uvector<typename ImplT::dist_t> out_dists_;
+  rmm::device_uvector<IdxT> out_idxs_;
+};
+
+const std::vector<params> kInputs{
+  {2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}, {10000, 8192, 1000, 32}};
+
+const std::vector<TransferStrategy> kAllStrategies{
+  TransferStrategy::NO_COPY, TransferStrategy::MAP_PINNED, TransferStrategy::MANAGED};
+const std::vector<TransferStrategy> kNoCopyOnly{TransferStrategy::NO_COPY};
+
+const std::vector<Scope> kScopeFull{Scope::BUILD_SEARCH};
+const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD};
+
+#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope)                     \
+  namespace BENCHMARK_PRIVATE_NAME(knn)                                            \
+  {                                                                                \
+    using KNN = knn<ValT, IdxT, ImplT<ValT, IdxT>>;                                \
+    RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" #ImplT, inputs, strats, scope);   \
+  }
+
+KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+
+KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull);
+
+}  // namespace raft::bench::spatial
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index c2d89aae7d..5d703bdb8d 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -72,7 +72,11 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
  faiss::gpu::
    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
+      heap(faiss::gpu::Limits<value_t>::getMax(),
+           std::numeric_limits<value_idx>::max(),
+           smemK,
+           smemV,
+           k);

  // Grid is exactly sized to rows available
  int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
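Note (illustration, not part of the patch): the heap filler id above changes from -1
to std::numeric_limits<value_idx>::max() so the "invalid index" sentinel stays
well-defined when value_idx is an unsigned type such as uint32_t. kInvalidId below
is a hypothetical name for that convention:

    #include <cstdint>
    #include <limits>

    // the sentinel pre-fills the k-selection heap; real candidates overwrite it
    template <typename value_idx>
    constexpr value_idx kInvalidId = std::numeric_limits<value_idx>::max();

    static_assert(kInvalidId<std::uint32_t> == 0xFFFFFFFFu, "all-ones for unsigned");
    static_assert(kInvalidId<std::int64_t> == 0x7FFFFFFFFFFFFFFFll, "max for signed");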
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 196124352a..f78ffa84e1 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -268,11 +268,11 @@ void brute_force_knn_impl(
  int device;
  RAFT_CUDA_TRY(cudaGetDevice(&device));

-  rmm::device_uvector<int64_t> trans(id_ranges->size(), userStream);
+  rmm::device_uvector<IdxType> trans(id_ranges->size(), userStream);
  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);

  rmm::device_uvector<value_t> all_D(0, userStream);
-  rmm::device_uvector<int64_t> all_I(0, userStream);
+  rmm::device_uvector<IdxType> all_I(0, userStream);

  value_t* out_D = res_D;
  IdxType* out_I = res_I;
@@ -342,6 +342,8 @@ void brute_force_knn_impl(
      args.numQueries   = n;
      args.outDistances = out_d_ptr;
      args.outIndices   = out_i_ptr;
+      args.outIndicesType = sizeof(IdxType) == 4 ? faiss::gpu::IndicesDataType::I32
+                                                 : faiss::gpu::IndicesDataType::I64;

      /**
       * @todo: Until FAISS supports pluggable allocation strategies,
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh
index 6cf2418d29..bbbbf67d71 100644
--- a/cpp/include/raft/spatial/knn/specializations/knn.cuh
+++ b/cpp/include/raft/spatial/knn/specializations/knn.cuh
@@ -50,6 +50,38 @@ extern template void brute_force_knn<int64_t, float, int>(raft::handle_t const& handle,
                                                           std::vector<int64_t>* translations,
                                                           distance::DistanceType metric,
                                                           float metric_arg);
+
+extern template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
+                                                           std::vector<float*>& input,
+                                                           std::vector<int>& sizes,
+                                                           int D,
+                                                           float* search_items,
+                                                           int n,
+                                                           uint32_t* res_I,
+                                                           float* res_D,
+                                                           int k,
+                                                           bool rowMajorIndex,
+                                                           bool rowMajorQuery,
+                                                           std::vector<uint32_t>* translations,
+                                                           distance::DistanceType metric,
+                                                           float metric_arg);
+
+extern template void brute_force_knn<uint32_t, float, unsigned int>(
+  raft::handle_t const& handle,
+  std::vector<float*>& input,
+  std::vector<unsigned int>& sizes,
+  unsigned int D,
+  float* search_items,
+  unsigned int n,
+  uint32_t* res_I,
+  float* res_D,
+  unsigned int k,
+  bool rowMajorIndex,
+  bool rowMajorQuery,
+  std::vector<uint32_t>* translations,
+  distance::DistanceType metric,
+  float metric_arg);
+
 };  // namespace knn
 };  // namespace spatial
 };  // namespace raft
diff --git a/cpp/src/nn/specializations/knn.cu b/cpp/src/nn/specializations/knn.cu
index bb59e5b2ba..4e0a821c24 100644
--- a/cpp/src/nn/specializations/knn.cu
+++ b/cpp/src/nn/specializations/knn.cu
@@ -51,6 +51,36 @@ template void brute_force_knn<int64_t, float, int>(raft::handle_t const& handle,
                                                    distance::DistanceType metric,
                                                    float metric_arg);

+template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
+                                                    std::vector<float*>& input,
+                                                    std::vector<int>& sizes,
+                                                    int D,
+                                                    float* search_items,
+                                                    int n,
+                                                    uint32_t* res_I,
+                                                    float* res_D,
+                                                    int k,
+                                                    bool rowMajorIndex,
+                                                    bool rowMajorQuery,
+                                                    std::vector<uint32_t>* translations,
+                                                    distance::DistanceType metric,
+                                                    float metric_arg);
+
+template void brute_force_knn<uint32_t, float, unsigned int>(raft::handle_t const& handle,
+                                                             std::vector<float*>& input,
+                                                             std::vector<unsigned int>& sizes,
+                                                             unsigned int D,
+                                                             float* search_items,
+                                                             unsigned int n,
+                                                             uint32_t* res_I,
+                                                             float* res_D,
+                                                             unsigned int k,
+                                                             bool rowMajorIndex,
+                                                             bool rowMajorQuery,
+                                                             std::vector<uint32_t>* translations,
+                                                             distance::DistanceType metric,
+                                                             float metric_arg);
+
 };  // namespace knn
 };  // namespace spatial
 };  // namespace raft
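Note (illustration, not part of the patch): the two new instantiations differ only in
the integer type used for the counts (the sizes vector, D, n and k) -- int vs
unsigned int -- while both emit uint32_t neighbour indices. A caller selects between
them through its argument types (all variables hypothetical):

    std::vector<int> sizes_i{n_rows};  // int counts
    raft::spatial::knn::brute_force_knn<uint32_t, float, int>(
      handle, input, sizes_i, dim, queries, n_queries, idx_u32, dist, k);

    std::vector<unsigned int> sizes_u{n_rows_u};  // unsigned counts
    raft::spatial::knn::brute_force_knn<uint32_t, float, unsigned int>(
      handle, input, sizes_u, dim_u, queries, n_queries_u, idx_u32, dist, k_u);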
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index bf13288f48..37e0edb6ab 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -16,8 +16,8 @@

 #include "../test_utils.h"

+#include <raft/common/logger.hpp>
 #include <raft/spatial/knn/knn.cuh>
-
 #include <rmm/device_uvector.hpp>
 #if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.cuh>
 #endif
@@ -40,8 +40,9 @@ struct KNNInputs {
  std::vector<int> labels;
 };

+template <typename IdxT>
 __global__ void build_actual_output(
-  int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices)
+  int* output, int n_rows, int k, const int* idx_labels, const IdxT* indices)
 {
  int element = threadIdx.x + blockDim.x * blockIdx.x;
  if (element >= n_rows * k) return;
@@ -60,7 +61,7 @@ __global__ void build_expected_output(int* output, int n_rows, int k, const int* labels)
  }
 }

-template <typename T>
+template <typename T, typename IdxT>
 class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  public:
  KNNTest()
@@ -79,9 +80,11 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  protected:
  void testBruteForce()
  {
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
    raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout);
-    std::cout << "K: " << k_ << "\n";
+    std::cout << "K: " << k_ << std::endl;
    raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout);
+#endif

    std::vector<float*> input_vec;
    std::vector<int> sizes_vec;
@@ -131,7 +134,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
    RAFT_CUDA_TRY(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream));
    RAFT_CUDA_TRY(
      cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(IdxT), stream));
    RAFT_CUDA_TRY(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream));
    RAFT_CUDA_TRY(
      cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream));
@@ -165,7 +168,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  int cols_;
  rmm::device_uvector<float> input_;
  rmm::device_uvector<float> search_data_;
-  rmm::device_uvector<int64_t> indices_;
+  rmm::device_uvector<IdxT> indices_;
  rmm::device_uvector<float> distances_;
  int k_;

@@ -191,10 +194,13 @@ const std::vector<KNNInputs> inputs = {
   2,
   {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}};

-typedef KNNTest<float> KNNTestF;
-TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); }
+typedef KNNTest<float, int64_t> KNNTestFint64_t;
+TEST_P(KNNTestFint64_t, BruteForce) { this->testBruteForce(); }
+typedef KNNTest<float, uint32_t> KNNTestFuint32_t;
+TEST_P(KNNTestFuint32_t, BruteForce) { this->testBruteForce(); }

-INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFint64_t, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFuint32_t, ::testing::ValuesIn(inputs));

 }  // namespace knn
 }  // namespace spatial
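For reference, a minimal sketch of calling the new uint32_t-index path exercised by
these tests (illustration only; knn_u32 and its parameters are hypothetical, and the
buffers are assumed to be populated device memory):

    #include <raft/spatial/knn/knn.cuh>
    #include <vector>

    void knn_u32(raft::handle_t& handle,
                 float* index_data, int n_rows,       // index dataset, row-major
                 float* queries, int n_queries,       // query dataset, row-major
                 int dim, int k,
                 uint32_t* out_idx, float* out_dist)  // n_queries * k outputs each
    {
      std::vector<float*> input{index_data};
      std::vector<int> sizes{n_rows};
      // res_I being uint32_t* selects the 32-bit index instantiation; faiss then
      // writes I32 indices directly (see args.outIndicesType in the patch above).
      raft::spatial::knn::brute_force_knn(
        handle, input, sizes, dim, queries, n_queries, out_idx, out_dist, k);
    }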