
Fix ANN bench latency #1940

Merged: 23 commits, Nov 3, 2023

Commits (23)
98152ff
Define latency, add sync, add FixLatencyWorkload
tfeher Oct 31, 2023
9604b62
Remove manual timing from within the loop
tfeher Oct 31, 2023
884b0bc
Update timer description
tfeher Oct 31, 2023
fb81d3f
Add configuration for ANN benchmark tests
tfeher Oct 31, 2023
b4f37f1
update doc
tfeher Nov 1, 2023
5186c12
Add --threads benchmark arg
tfeher Nov 2, 2023
dc8fc0f
Add docstring for --threads arg
tfeher Nov 2, 2023
274283c
Merge remote-tracking branch 'origin/branch-23.12' into fix_ann_bench…
tfeher Nov 2, 2023
fefc16c
remove fixed_latency_workload that was used for debugging
tfeher Nov 2, 2023
52670f7
restore raft_benchmark.cu
tfeher Nov 2, 2023
c310095
Extend comments on synchronization
tfeher Nov 2, 2023
777ded7
Fix query_set initialization
tfeher Nov 2, 2023
71659a2
Adding search-threads option
cjnolet Nov 2, 2023
f87430b
Correctly passing search-thread
cjnolet Nov 2, 2023
7645a3d
Using for search
cjnolet Nov 2, 2023
2a8822c
Update __main__.py
cjnolet Nov 3, 2023
4a7aec5
Including cuda_stub
cjnolet Nov 3, 2023
be90181
Merge branch 'fix_ann_bench_latency' of github.com:tfeher/raft into f…
cjnolet Nov 3, 2023
cca71e8
Fixing style
cjnolet Nov 3, 2023
a57700b
Merge branch 'branch-23.12' into fix_ann_bench_latency
cjnolet Nov 3, 2023
d903245
Remove cudaDeviceSynchronize() to fix CPU_ONLY build
tfeher Nov 3, 2023
b19fb44
Removing cudart calls from benchmark.hpp
cjnolet Nov 3, 2023
ab049c3
Merge branch 'fix_ann_bench_latency' of github.com:tfeher/raft into f…
cjnolet Nov 3, 2023
26 changes: 11 additions & 15 deletions cpp/bench/ann/src/common/benchmark.hpp
@@ -172,8 +172,6 @@ void bench_search(::benchmark::State& state,
std::ptrdiff_t batch_offset = 0;
std::size_t queries_processed = 0;

double total_time = 0;

const auto& sp_json = index.search_params[search_param_ix];

if (state.thread_index() == 0) { dump_parameters(state, sp_json); }
@@ -249,13 +247,14 @@ void bench_search(::benchmark::State& state,
{
nvtx_case nvtx{state.name()};

// gbench ensures that all threads are synchronized at the start of the benchmark loop.

// TODO: Have the odd threads load the queries backwards just to rule out caching.
ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
for (auto _ : state) {
[[maybe_unused]] auto ntx_lap = nvtx.lap();
[[maybe_unused]] auto gpu_lap = gpu_timer.lap();

auto start = std::chrono::high_resolution_clock::now();
// run the search
try {
algo->search(query_set + batch_offset * dataset->dim(),
@@ -268,24 +267,22 @@ void bench_search(::benchmark::State& state,
state.SkipWithError(std::string(e.what()));
}

auto end = std::chrono::high_resolution_clock::now();

auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
// advance to the next batch
batch_offset = (batch_offset + n_queries) % query_set_size;
queries_processed += n_queries;
state.SetIterationTime(elapsed_seconds.count());
total_time += elapsed_seconds.count();
}
}
auto end = std::chrono::high_resolution_clock::now();
if (state.thread_index() == 0) {
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
state.counters.insert({{"end_to_end", duration}});
}
cudaDeviceSynchronize();
Comment by tfeher (Contributor, Author):

CPU-only builds fail at this point:

benchmark.hpp:299:3: error: there are no arguments to 'cudaDeviceSynchronize' that depend on a template parameter, so a declaration of 'cudaDeviceSynchronize' must be available [-fpermissive]
  299 |   cudaDeviceSynchronize()

I thought we had cuda_stub.hpp to help with this, but it does not work. If we are in a hurry we could disable the sync here, but it would be better to fix the cuda_stubs.
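A stub that resolves the CUDA runtime at run time would let the sync compile in CPU-only builds. The sketch below is only an assumption about how such a stub could look; all names (including `CudartStub`) are illustrative and this is not the actual `cuda_stub.hpp`:

```cpp
#include <cassert>
#include <dlfcn.h>

// Hypothetical sketch (illustrative; not the real cuda_stub.hpp): resolve
// cudaDeviceSynchronize via dlopen/dlsym so that CPU-only builds neither
// include CUDA headers nor link against libcudart.
struct CudartStub {
  void* handle{nullptr};
  int (*device_synchronize)(){nullptr};

  CudartStub()
  {
    handle = dlopen("libcudart.so", RTLD_LAZY);
    if (handle != nullptr) {
      device_synchronize =
        reinterpret_cast<int (*)()>(dlsym(handle, "cudaDeviceSynchronize"));
    }
  }
  ~CudartStub()
  {
    if (handle != nullptr) { dlclose(handle); }
  }
  bool found() const { return device_synchronize != nullptr; }
  void sync() const
  {
    // no-op when the CUDA runtime is absent, a real device sync when present
    if (found()) { device_synchronize(); }
  }
};
```

With a stub like this, the benchmark could call `sync()` unconditionally and a CPU-only build would still compile and run; whether the real `cuda_stub.hpp` follows exactly this pattern is an assumption here.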

auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
if (state.thread_index() == 0) { state.counters.insert({{"end_to_end", duration}}); }
state.counters.insert(
{"Latency", {duration / double(state.iterations()), benchmark::Counter::kAvgThreads}});

state.SetItemsProcessed(queries_processed);
if (cudart.found()) {
state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
double gpu_time_per_iteration = gpu_timer.total_time() / (double)state.iterations();
state.counters.insert({"GPU", {gpu_time_per_iteration, benchmark::Counter::kAvgThreads}});
}

// This will be the total number of queries across all threads
@@ -619,5 +616,4 @@ inline auto run_main(int argc, char** argv) -> int
current_algo.reset();
return 0;
}

}; // namespace raft::bench::ann
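The timing scheme this diff moves to can be sketched in isolation: the clock is read once around the whole benchmark loop, and per-batch latency is derived by division afterwards. The stand-in below is illustrative only (`run_batches` and the sleep workload are not part of the benchmark):

```cpp
#include <cassert>
#include <chrono>
#include <thread>

struct BenchResult {
  double end_to_end;  // wall-clock seconds, measured once around the whole loop
  double latency;     // seconds per batch = end_to_end / iterations
};

// Time `iterations` batches the way the patched benchmark does: one clock read
// before the loop, one after, and no per-iteration timing inside the loop.
inline BenchResult run_batches(int iterations, int batch_ms)
{
  const auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < iterations; i++) {
    // stand-in for algo->search(...): a fixed workload per batch
    std::this_thread::sleep_for(std::chrono::milliseconds(batch_ms));
  }
  const auto end = std::chrono::high_resolution_clock::now();
  const double duration =
    std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
  return {duration, duration / static_cast<double>(iterations)};
}
```

Compared with calling the clock inside every iteration (as the removed `total_time` accumulation did), this keeps timer overhead out of the per-batch numbers and matches the `end_to_end` and `Latency` counters the diff adds.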
121 changes: 121 additions & 0 deletions cpp/bench/ann/src/raft/fix_latency_workload.h
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cassert>
#include <fstream>
#include <iostream>
#include <memory>
#include <raft/core/logger.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/distance/distance_types.hpp>
#include <stdexcept>
#include <string>

#include <raft/util/cudart_utils.hpp>

#include <chrono>
#include <thread>

namespace raft::bench::ann {

// This kernel sleeps for 10ms
__global__ inline void kernel_sleep(int sleep_ms)
{
for (int i = 0; i < sleep_ms; i++)
__nanosleep(1000000U);  // 1 ms
}

inline void workload(bool use_gpu, bool sync_stream, int sleep_ms, cudaStream_t stream)
{
if (use_gpu) {
kernel_sleep<<<1, 1, 0, stream>>>(sleep_ms);
if (sync_stream) { cudaStreamSynchronize(stream); }
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
}
}

class FixLatencyWorkload : public ANN<float> {
public:
using typename ANN<float>::AnnSearchParam;

struct SearchParam : public AnnSearchParam {
bool use_gpu = true;
bool sync_stream = true;
int sleep_ms = 10;
};

using BuildParam = SearchParam;

FixLatencyWorkload(Metric metric, int dim, const BuildParam& param)
: ANN<float>(metric, dim), build_param_{param}
{
}

~FixLatencyWorkload() noexcept {}

void build(const float* dataset, size_t nrow, cudaStream_t stream) final
{
workload(build_param_.use_gpu,
build_param_.sync_stream,
build_param_.sleep_ms,
raft::resource::get_cuda_stream(handle_));
}

void set_search_param(const AnnSearchParam& param) override
{
search_param_ = dynamic_cast<const SearchParam&>(param);
}

// TODO: if the number of results is less than k, the remaining elements of 'neighbors'
// will be filled with (size_t)-1
void search(const float* queries,
int batch_size,
int k,
size_t* neighbors,
float* distances,
cudaStream_t stream = 0) const override
{
workload(search_param_.use_gpu,
search_param_.sync_stream,
search_param_.sleep_ms,
raft::resource::get_cuda_stream(handle_));
}

// to enable dataset access from GPU memory
AlgoProperty get_preference() const override
{
AlgoProperty property;
property.dataset_memory_type = MemoryType::HostMmap;
property.query_memory_type = MemoryType::Device;
return property;
}

void save(const std::string& file) const override
{
std::ofstream of(file, std::ios::out | std::ios::binary);
of.close();
}

void load(const std::string&) override {}

private:
raft::device_resources handle_;
BuildParam build_param_;
SearchParam search_param_;
};
} // namespace raft::bench::ann
23 changes: 22 additions & 1 deletion cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -47,10 +47,19 @@ extern template class raft::bench::ann::RaftCagra<uint8_t, uint32_t>;
extern template class raft::bench::ann::RaftCagra<int8_t, uint32_t>;
#endif
#define JSON_DIAGNOSTICS 1
#include "fix_latency_workload.h"
#include <nlohmann/json.hpp>

namespace raft::bench::ann {

inline void parse_search_param(const nlohmann::json& conf,
typename raft::bench::ann::FixLatencyWorkload::SearchParam& param)
{
if (conf.contains("use_gpu")) param.use_gpu = conf.at("use_gpu");
if (conf.contains("sync_stream")) param.sync_stream = conf.at("sync_stream");
if (conf.contains("sleep_ms")) param.sleep_ms = conf.at("sleep_ms");
}

#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf,
@@ -198,6 +207,11 @@ std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN
if (algo == "raft_bfknn") { ann = std::make_unique<raft::bench::ann::RaftGpu<T>>(metric, dim); }
#endif
if (algo == "fix_latency") {
typename raft::bench::ann::FixLatencyWorkload::BuildParam param;
parse_search_param(conf, param); // Note build and search params are the same.
ann = std::make_unique<raft::bench::ann::FixLatencyWorkload>(metric, dim, param);
}
}

if constexpr (std::is_same_v<T, uint8_t>) {}
@@ -238,6 +252,14 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
return param;
}
#endif

if constexpr (std::is_same_v<T, float>) {
if (algo == "fix_latency") {
auto param = std::make_unique<typename raft::bench::ann::FixLatencyWorkload::SearchParam>();
parse_search_param(conf, *param);
return param;
}
}
#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
if (algo == "raft_ivf_flat") {
auto param =
@@ -263,7 +285,6 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
// else
throw std::runtime_error("invalid algo: '" + algo + "'");
}

}; // namespace raft::bench::ann

REGISTER_ALGO_INSTANCE(float);
25 changes: 14 additions & 11 deletions docs/source/raft_ann_benchmarks.md
@@ -377,20 +377,23 @@ The benchmarks capture several different measurements. The table below describes

The table below describes each of the measurements for the index search benchmarks:

| Name | Description |
|------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
| Benchmark | A name that uniquely identifies the benchmark instance |
| Time | The average runtime for each batch. This is approximately `end_to_end` / `Iterations` |
| CPU | The average `wall-time`. In `throughput` mode, this is the average `wall-time` spent in each thread. |
| Iterations | Total number of batches. This is going to be `total_queres` / `n_queries` |
| Recall | Proportion of correct neighbors to ground truth neighbors. Note this column is only present if groundtruth file is specified in dataset configuration |
| items_per_second | Total throughput. This is approximately `total_queries` / `end_to_end`. |
| k | Number of neighbors being queried in each iteration |
| Name | Description |
|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
| Benchmark | A name that uniquely identifies the benchmark instance |
| Time | The wall clock time divided by the number of threads. |
| CPU | The average CPU time (user + sys time). This does not include idle time (which can also happen while waiting for GPU sync). |
| Iterations | Total number of batches. This is going to be `total_queries` / `n_queries` |
| GPU | GPU latency of a single batch (seconds). In throughput mode this is averaged over multiple threads. |
| Latency | Latency of a single batch (second), calculated from wall-clock time. In throughput mode this is averaged over multiple threads. |
| Recall | Proportion of correct neighbors to ground truth neighbors. Note this column is only present if groundtruth file is specified in dataset configuration. |
| items_per_second | Total throughput. This is approximately `total_queries` / `end_to_end`. |
| k | Number of neighbors being queried in each iteration |
| end_to_end | Total time taken to run all batches for all iterations |
| n_queries | Total number of query vectors in each batch |
| total_queries | Total number of vectors queries across all iterations |
| n_queries | Total number of query vectors in each batch |
| total_queries | Total number of query vectors across all iterations |

Note that the actual table displayed on the screen may differ slightly as the hyper-parameters will also be displayed for each different combination being benchmarked.
Note on recall calculation: the number of queries processed per test depends on the number of iterations. Because of this, recall can show slight fluctuations if fewer neighbors are processed than are available for the benchmark.
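To make the counter definitions concrete, a single-threaded worked example (with made-up numbers): 10 batches of 1000 queries completing in 2.0 s of wall time give `Latency` = 0.2 s/batch and `items_per_second` = 5000. The dependent counters derive from the primitive ones as:

```cpp
#include <cassert>

// Latency: wall-clock seconds per batch (single thread).
inline double latency(double end_to_end, int iterations)
{
  return end_to_end / static_cast<double>(iterations);
}

// items_per_second: aggregate query throughput.
inline double items_per_second(long total_queries, double end_to_end)
{
  return static_cast<double>(total_queries) / end_to_end;
}
```

With N threads in throughput mode, `Latency` is averaged across threads while `items_per_second` aggregates the queries of all of them; the exact multi-threaded bookkeeping here is a simplification of what the benchmark reports.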

## Creating and customizing dataset configurations

29 changes: 29 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/run/conf/fix_latency.json
@@ -0,0 +1,29 @@
{
"dataset": {
"name": "dummy",
"base_file": "deep-1B/base.1B.fbin",
"subset_size": 10,
"query_file": "deep-1B/query.public.10K.fbin",
"groundtruth_neighbors_file": "deep-1B/groundtruth.neighbors.ibin",
"distance": "euclidean"
},

"search_basic_param": {
"batch_size": 10000,
"k": 10
},

"index": [
{
"name": "fix_latency",
"algo": "fix_latency",
"build_param": {"use_gpu": true},
"file": "fix_latency/dummy_idx",
"search_params": [
{"use_gpu": false},
{"use_gpu": true, "sync_stream": false},
{"use_gpu": true, "sync_stream": true}
]
}
]
}