From 64efd1e9df652768d879989aadb5449aaf1ce104 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 4 Nov 2023 04:49:21 +0100 Subject: [PATCH] Fixing hnswlib in latency mode (#1959) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1959 --- cpp/bench/ann/src/common/benchmark.hpp | 12 ++++++++---- cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 13 +++++-------- .../src/raft-ann-bench/run/conf/datasets.yaml | 6 ++++++ 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index ecb5e366b5..9ddb232f5c 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -198,7 +198,7 @@ void bench_search(::benchmark::State& state, * Make sure the first thread loads the algo and dataset */ if (state.thread_index() == 0) { - std::lock_guard lk(init_mutex); + std::unique_lock lk(init_mutex); // algo is static to cache it between close search runs to save time on index loading static std::string index_file = ""; if (index.file != index_file) { @@ -249,9 +249,11 @@ void bench_search(::benchmark::State& state, query_set = dataset->query_set(current_algo_props->query_memory_type); cond_var.notify_all(); } else { - // All other threads will wait for the first thread to initialize the algo. std::unique_lock lk(init_mutex); - cond_var.wait(lk, [] { return current_algo_props.get() != nullptr; }); + // All other threads will wait for the first thread to initialize the algo. + + cond_var.wait( + lk, [] { return current_algo_props.get() != nullptr && current_algo.get() != nullptr; }); // gbench ensures that all threads are synchronized at the start of the benchmark loop. // We are accessing shared variables (like current_algo, current_algo_probs) before the // benchmark loop, therefore the synchronization here is necessary. @@ -292,6 +294,7 @@ void bench_search(::benchmark::State& state, // advance to the next batch batch_offset = (batch_offset + n_queries) % query_set_size; + queries_processed += n_queries; } } @@ -410,7 +413,6 @@ void register_search(std::shared_ptr> dataset, auto* b = ::benchmark::RegisterBenchmark( index.name + suf, bench_search, index, i, dataset, metric_objective) ->Unit(benchmark::kMillisecond) - ->ThreadRange(threads[0], threads[1]) /** * The following are important for getting accuracy QPS measurements on both CPU * and GPU These make sure that @@ -420,6 +422,8 @@ void register_search(std::shared_ptr> dataset, */ ->MeasureProcessCPUTime() ->UseRealTime(); + + if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); } } } } diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 23cae6352c..364da81f77 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -147,7 +147,6 @@ void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) char buf[20]; std::time_t now = std::time(nullptr); std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); - printf("%s building %zu / %zu\n", buf, i, items_per_thread); fflush(stdout); } @@ -163,13 +162,11 @@ void HnswLib::set_search_param(const AnnSearchParam& param_) auto param = dynamic_cast(param_); appr_alg_->ef_ = param.ef; metric_objective_ = param.metric_objective; + num_threads_ = param.num_threads; - bool use_pool = (metric_objective_ == Objective::LATENCY && param.num_threads > 1) && - (!thread_pool_ || num_threads_ != param.num_threads); - if (use_pool) { - num_threads_ = param.num_threads; - thread_pool_ = std::make_unique(num_threads_); - } + // Create a pool if multiple query threads have been set and the pool hasn't been created already + bool create_pool = (metric_objective_ == Objective::LATENCY && num_threads_ > 1 && !thread_pool_); + if (create_pool) { thread_pool_ = std::make_unique(num_threads_); } } template @@ -180,7 +177,7 @@ void HnswLib::search( // hnsw can only handle a single vector at a time. get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); }; - if (metric_objective_ == Objective::LATENCY) { + if (metric_objective_ == Objective::LATENCY && num_threads_ > 1) { thread_pool_->submit(f, batch_size); } else { for (int i = 0; i < batch_size; i++) { diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml index 11a91f91fd..a2b948a464 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml @@ -39,36 +39,42 @@ dims: 960 base_file: gist-960-euclidean/base.fbin query_file: gist-960-euclidean/query.fbin + groundtruth_neighbors_file: gist-960-euclidean/groundtruth.neighbors.ibin distance: euclidean - name: glove-50-angular dims: 50 base_file: glove-50-angular/base.fbin query_file: glove-50-angular/query.fbin + groundtruth_neighbors_file: glove-50-angular/groundtruth.neighbors.ibin distance: euclidean - name: glove-50-inner dims: 50 base_file: glove-50-inner/base.fbin query_file: glove-50-inner/query.fbin + groundtruth_neighbors_file: glove-50-inner/groundtruth.neighbors.ibin distance: euclidean - name: glove-100-angular dims: 100 base_file: glove-100-angular/base.fbin query_file: glove-100-angular/query.fbin + groundtruth_neighbors_file: glove-100-angular/groundtruth.neighbors.ibin distance: euclidean - name: glove-100-inner dims: 100 base_file: glove-100-inner/base.fbin query_file: glove-100-inner/query.fbin + groundtruth_neighbors_file: glove-100-inner/groundtruth.neighbors.ibin distance: euclidean - name: lastfm-65-angular dims: 65 base_file: lastfm-65-angular/base.fbin query_file: lastfm-65-angular/query.fbin + groundtruth_neighbors_file: lastfm-65-angular/groundtruth.neighbors.ibin distance: euclidean - name: mnist-784-euclidean