
Cache IVF-PQ and select-warpsort kernel launch parameters to reduce latency
achirkin committed Aug 30, 2023
1 parent 6f58669 commit a653b8b
Showing 4 changed files with 156 additions and 49 deletions.
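The diffs below share one pattern: an LRU cache stored as a per-`raft::resources` user resource, keyed by the kernel configuration, and consulted before an expensive occupancy query or kernel selection. A minimal sketch of that pattern, assuming only the `cache::lru` get/set interface and the `resource::get_user_resource` accessor that appear in the changes below (the helper name `cached_or_compute` is hypothetical and not part of this commit):

```cpp
#include <raft/core/resource/user_resource.hpp>
#include <raft/core/resources.hpp>

// Hypothetical helper (not in this commit): look up a value in a per-resource LRU cache,
// compute and memoize it on a miss. CacheT is expected to expose a `value` member with the
// get/set interface used below (e.g. warpsort_params_cache, search_kernel_cache).
template <typename CacheT, typename KeyT, typename ValueT, typename ComputeFn>
auto cached_or_compute(raft::resources const& res, const KeyT& key, ComputeFn&& compute) -> ValueT
{
  auto& cache = raft::resource::get_user_resource<CacheT>(res)->value;  // created on first use
  ValueT value;
  if (!cache.get(key, &value)) {  // miss: run the expensive computation once...
    value = compute();
    cache.set(key, value);  // ...and remember it for later calls on the same resources handle
  }
  return value;
}
```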
2 changes: 1 addition & 1 deletion cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -327,7 +327,7 @@ void select_k(raft::resources const& handle,
case Algo::kWarpDistributedShm:
return detail::select::warpsort::
select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
case Algo::kFaissBlockSelect:
return neighbors::detail::select_k(
in_val, in_idx, batch_size, len, out_val, out_idx, select_min, k, stream);
91 changes: 62 additions & 29 deletions cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -18,7 +18,9 @@

#include <raft/core/detail/macros.hpp>
#include <raft/core/logger.hpp>
#include <raft/core/resource/user_resource.hpp>
#include <raft/util/bitonic_sort.cuh>
#include <raft/util/cache.hpp>
#include <raft/util/cuda_utils.cuh>
#include <raft/util/integer_utils.hpp>
#include <raft/util/pow2_utils.cuh>
@@ -773,6 +775,11 @@ __launch_bounds__(256) __global__
queue.store(out + block_id * k, out_idx + block_id * k);
}

struct launch_params {
int block_size = 0;
int min_grid_size = 0;
};

template <template <int, bool, typename, typename> class WarpSortClass,
typename T,
typename IdxT,
@@ -790,29 +797,28 @@ struct launch_setup {
* @param[in] block_size_limit
* Forcefully limit the block size (optional)
*/
static void calc_optimal_params(int k,
int* block_size,
int* min_grid_size,
int block_size_limit = 0)
static auto calc_optimal_params(int k, int block_size_limit) -> launch_params
{
const int capacity = bound_by_power_of_two(k);
if constexpr (Capacity > 1) {
if (capacity < Capacity) {
return launch_setup<WarpSortClass, T, IdxT, Capacity / 2>::calc_optimal_params(
capacity, block_size, min_grid_size, block_size_limit);
capacity, block_size_limit);
}
}
ASSERT(capacity <= Capacity, "Requested k is too big (%d)", k);
auto calc_smem = [k](int block_size) {
int num_of_warp = block_size / std::min<int>(WarpSize, Capacity);
return calc_smem_size_for_block_wide<T, IdxT>(num_of_warp, k);
};
launch_params ps;
RAFT_CUDA_TRY(cudaOccupancyMaxPotentialBlockSizeVariableSMem(
min_grid_size,
block_size,
&ps.min_grid_size,
&ps.block_size,
block_kernel<WarpSortClass, Capacity, true, T, IdxT>,
calc_smem,
block_size_limit));
return ps;
}

static void kernel(int k,
@@ -869,6 +875,29 @@ struct launch_setup {
}
};

template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
struct warpsort_params_cache {
static constexpr size_t kDefaultSize = 100;
cache::lru<uint64_t, std::hash<uint64_t>, std::equal_to<>, launch_params> value{kDefaultSize};
};

template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
static auto calc_optimal_params(raft::resources const& res, int k, int block_size_limit = 0)
-> launch_params
{
static thread_local std::unordered_map<uint64_t, launch_params> memo{};
uint64_t key = (static_cast<uint64_t>(k) << 32) | static_cast<uint64_t>(block_size_limit);
auto& cache =
resource::get_user_resource<warpsort_params_cache<WarpSortClass, T, IdxT>>(res)->value;
launch_params val;
if (!cache.get(key, &val)) {
val =
launch_setup<WarpSortClass, T, IdxT, kMaxCapacity>::calc_optimal_params(k, block_size_limit);
cache.set(key, val);
}
return val;
}

template <template <int, bool, typename, typename> class WarpSortClass>
struct LaunchThreshold {};

@@ -898,15 +927,19 @@ struct LaunchThreshold<warp_sort_immediate> {
};

template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
void calc_launch_parameter(
size_t batch_size, size_t len, int k, int* p_num_of_block, int* p_num_of_warp)
void calc_launch_parameter(raft::resources const& res,
size_t batch_size,
size_t len,
int k,
int* p_num_of_block,
int* p_num_of_warp)
{
const int capacity = bound_by_power_of_two(k);
const int capacity_per_full_warp = std::max(capacity, WarpSize);
int block_size = 0;
int min_grid_size = 0;
launch_setup<WarpSortClass, T, IdxT>::calc_optimal_params(k, &block_size, &min_grid_size);
block_size = Pow2<WarpSize>::roundDown(block_size);
auto lps = calc_optimal_params<WarpSortClass, T, IdxT>(res, k);
int block_size = lps.block_size;
int min_grid_size = lps.min_grid_size;
block_size = Pow2<WarpSize>::roundDown(block_size);

int num_of_warp;
int num_of_block;
@@ -950,19 +983,16 @@ void calc_launch_parameter(
// to occupy a single block well.
block_size = adjust_block_size(block_size);
do {
num_of_warp = block_size / WarpSize;
int another_block_size = 0;
int another_min_grid_size = 0;
launch_setup<WarpSortClass, T, IdxT>::calc_optimal_params(
k, &another_block_size, &another_min_grid_size, block_size);
another_block_size = adjust_block_size(another_block_size);
if (batch_size >= size_t(another_min_grid_size) // still have enough work
&& another_block_size < block_size // protect against an infinite loop
&& another_min_grid_size * another_block_size >
num_of_warp = block_size / WarpSize;
auto another = calc_optimal_params<WarpSortClass, T, IdxT>(res, k, block_size);
another.block_size = adjust_block_size(another.block_size);
if (batch_size >= size_t(another.min_grid_size) // still have enough work
&& another.block_size < block_size // protect against an infinite loop
&& another.min_grid_size * another.block_size >
min_grid_size * block_size // improve occupancy
) {
block_size = another_block_size;
min_grid_size = another_min_grid_size;
block_size = another.block_size;
min_grid_size = another.min_grid_size;
} else {
break;
}
@@ -1036,7 +1066,8 @@ void select_k_(int num_of_block,
}

template <typename T, typename IdxT, template <int, bool, typename, typename> class WarpSortClass>
void select_k_impl(const T* in,
void select_k_impl(raft::resources const& res,
const T* in,
const IdxT* in_idx,
size_t batch_size,
size_t len,
@@ -1049,7 +1080,8 @@ void select_k_impl(const T* in,
{
int num_of_block = 0;
int num_of_warp = 0;
calc_launch_parameter<WarpSortClass, T, IdxT>(batch_size, len, k, &num_of_block, &num_of_warp);
calc_launch_parameter<WarpSortClass, T, IdxT>(
res, batch_size, len, k, &num_of_block, &num_of_warp);

select_k_<WarpSortClass, T, IdxT>(num_of_block,
num_of_warp,
@@ -1103,7 +1135,8 @@ void select_k_impl(const T* in,
* memory pool here to avoid memory allocations within the call).
*/
template <typename T, typename IdxT>
void select_k(const T* in,
void select_k(raft::resources const& res,
const T* in,
const IdxT* in_idx,
size_t batch_size,
size_t len,
@@ -1123,7 +1156,7 @@
int num_of_block = 0;
int num_of_warp = 0;
calc_launch_parameter<warp_sort_immediate, T, IdxT>(
batch_size, len, k, &num_of_block, &num_of_warp);
res, batch_size, len, k, &num_of_block, &num_of_warp);
int len_per_thread = len / (num_of_block * num_of_warp * std::min(capacity, WarpSize));

if (len_per_thread <= LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing) {
@@ -1141,7 +1174,7 @@
mr);
} else {
calc_launch_parameter<warp_sort_filtered, T, IdxT>(
batch_size, len, k, &num_of_block, &num_of_warp);
res, batch_size, len, k, &num_of_block, &num_of_warp);
select_k_<warp_sort_filtered, T, IdxT>(num_of_block,
num_of_warp,
in,
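With this change, the warpsort entry points take the `raft::resources` handle as the first argument, so repeated selections with the same `k` reuse the cached launch parameters instead of re-running the occupancy query, as long as the same handle is reused. A minimal usage sketch; the wrapper `run_repeated_topk`, its argument names, and the choice of `float`/`uint32_t` are illustrative assumptions, not part of the commit:

```cpp
#include <cstddef>
#include <cstdint>

#include <cuda_runtime.h>

#include <raft/core/resources.hpp>
#include <raft/matrix/detail/select_warpsort.cuh>

// Hypothetical helper (not in this commit): run the same top-k selection repeatedly.
// After the first call, the per-resource LRU cache holds the launch parameters for this k,
// so later iterations skip cudaOccupancyMaxPotentialBlockSizeVariableSMem.
void run_repeated_topk(raft::resources const& handle,
                       const float* in_val,     // [batch_size, len], device memory
                       const uint32_t* in_idx,  // [batch_size, len], optional input indices
                       float* out_val,          // [batch_size, k], device memory
                       uint32_t* out_idx,       // [batch_size, k], device memory
                       std::size_t batch_size,
                       std::size_t len,
                       int k,
                       cudaStream_t stream,
                       int n_iters)
{
  for (int i = 0; i < n_iters; ++i) {
    raft::matrix::detail::select::warpsort::select_k<float, uint32_t>(
      handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, /*select_min=*/true, stream);
  }
}
```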
95 changes: 83 additions & 12 deletions cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -33,13 +33,15 @@
#include <raft/core/operators.hpp>
#include <raft/core/resource/detail/device_memory_resource.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resource/user_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/distance/distance_types.hpp>
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/map.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/detail/select_k.cuh>
#include <raft/matrix/detail/select_warpsort.cuh>
#include <raft/util/cache.hpp>
#include <raft/util/cuda_utils.cuh>
#include <raft/util/device_atomics.cuh>
#include <raft/util/device_loads_stores.cuh>
@@ -80,6 +82,12 @@ void select_clusters(raft::resources const& handle,
const float* cluster_centers, // [n_lists, dim_ext]
rmm::mr::device_memory_resource* mr)
{
common::nvtx::range<common::nvtx::domain::raft> fun_scope(
"ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)",
n_probes,
n_queries,
n_lists,
dim);
auto stream = resource::get_cuda_stream(handle);
/* NOTE[qc_distances]
@@ -409,6 +417,46 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters,
return 1 + (n_queries - 1) * n_probes / (2 * n_clusters);
}

struct search_kernel_key {
bool manage_local_topk;
uint32_t locality_hint;
double preferred_shmem_carveout;
uint32_t pq_bits;
uint32_t pq_dim;
uint32_t precomp_data_count;
uint32_t n_queries;
uint32_t n_probes;
uint32_t topk;
};

inline auto operator==(const search_kernel_key& a, const search_kernel_key& b) -> bool
{
return a.manage_local_topk == b.manage_local_topk && a.locality_hint == b.locality_hint &&
a.preferred_shmem_carveout == b.preferred_shmem_carveout && a.pq_bits == b.pq_bits &&
a.pq_dim == b.pq_dim && a.precomp_data_count == b.precomp_data_count &&
a.n_queries == b.n_queries && a.n_probes == b.n_probes && a.topk == b.topk;
}

struct search_kernel_key_hash {
inline auto operator()(const search_kernel_key& x) const noexcept -> std::size_t
{
return (size_t{x.manage_local_topk} << 63) +
size_t{x.topk} * size_t{x.n_probes} * size_t{x.n_queries} +
size_t{x.precomp_data_count} * size_t{x.pq_dim} * size_t{x.pq_bits};
}
};

template <typename OutT, typename LutT, typename IvfSampleFilterT>
struct search_kernel_cache {
/** Number of matmul invocations to cache. */
static constexpr size_t kDefaultSize = 100;
cache::lru<search_kernel_key,
search_kernel_key_hash,
std::equal_to<>,
selected<OutT, LutT, IvfSampleFilterT>>
value{kDefaultSize};
};

/**
* The "main part" of the search, which assumes that outer-level `search` has already:
*
@@ -433,6 +481,12 @@ void ivfpq_search_worker(raft::resources const& handle,
double preferred_shmem_carveout,
IvfSampleFilterT sample_filter)
{
common::nvtx::range<common::nvtx::domain::raft> fun_scope(
"ivf_pq::search-worker(n_queries = %u, n_probes = %u, k = %u, dim = %zu)",
n_queries,
n_probes,
topK,
index.dim());
auto stream = resource::get_cuda_stream(handle);
auto mr = resource::get_workspace_resource(handle);

@@ -535,17 +589,32 @@
} break;
}

auto search_instance = compute_similarity_select<ScoreT, LutT, IvfSampleFilterT>(
resource::get_device_properties(handle),
manage_local_topk,
coresidency,
preferred_shmem_carveout,
index.pq_bits(),
index.pq_dim(),
precomp_data_count,
n_queries,
n_probes,
topK);
selected<ScoreT, LutT, IvfSampleFilterT> search_instance;
search_kernel_key search_key{manage_local_topk,
coresidency,
preferred_shmem_carveout,
index.pq_bits(),
index.pq_dim(),
precomp_data_count,
n_queries,
n_probes,
topK};
auto& cache =
resource::get_user_resource<search_kernel_cache<ScoreT, LutT, IvfSampleFilterT>>(handle)->value;
if (!cache.get(search_key, &search_instance)) {
search_instance = compute_similarity_select<ScoreT, LutT, IvfSampleFilterT>(
resource::get_device_properties(handle),
manage_local_topk,
coresidency,
preferred_shmem_carveout,
index.pq_bits(),
index.pq_dim(),
precomp_data_count,
n_queries,
n_probes,
topK);
cache.set(search_key, search_instance);
}

rmm::device_uvector<LutT> device_lut(search_instance.device_lut_size, stream, mr);
std::optional<device_vector<float>> query_kths_buf{std::nullopt};
@@ -696,7 +765,7 @@ inline auto get_max_batch_size(raft::resources const& res,
uint32_t max_samples) -> uint32_t
{
uint32_t max_batch_size = n_queries;
uint32_t n_ctas_total = getMultiProcessorCount() * 2;
uint32_t n_ctas_total = resource::get_device_properties(res).multiProcessorCount * 2;
uint32_t n_ctas_total_per_batch = n_ctas_total / max_batch_size;
float utilization = float(n_ctas_total_per_batch * max_batch_size) / n_ctas_total;
if (n_ctas_total_per_batch > 1 || (n_ctas_total_per_batch == 1 && utilization < 0.6)) {
@@ -798,6 +867,8 @@ inline void search(raft::resources const& handle,

for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) {
uint32_t queries_batch = min(max_queries, n_queries - offset_q);
common::nvtx::range<common::nvtx::domain::raft> batch_scope(
"ivf_pq::search-batch(queries: %u - %u)", offset_q, offset_q + queries_batch);

select_clusters(handle,
clusters_to_probe.data(),
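Both new caches rely on `cache::lru` from `<raft/util/cache.hpp>`. A simplified stand-in illustrating the get/set and least-recently-used eviction semantics assumed above; this is an illustration only, not raft's implementation (the capacity of 100 mirrors `kDefaultSize`):

```cpp
#include <cstddef>
#include <list>
#include <unordered_map>
#include <utility>

// Hypothetical, minimal LRU cache with the get/set interface used by the caches above.
template <typename K, typename V>
struct tiny_lru {
  explicit tiny_lru(std::size_t capacity) : capacity_(capacity) {}

  // Returns true and fills *out on a hit; false on a miss.
  bool get(const K& key, V* out)
  {
    auto it = map_.find(key);
    if (it == map_.end()) { return false; }
    order_.splice(order_.begin(), order_, it->second.second);  // mark as most recently used
    *out = it->second.first;
    return true;
  }

  // Inserts or refreshes a value, evicting the least recently used entry when full.
  void set(const K& key, const V& value)
  {
    auto it = map_.find(key);
    if (it != map_.end()) {  // refresh an existing entry
      it->second.first = value;
      order_.splice(order_.begin(), order_, it->second.second);
      return;
    }
    if (map_.size() >= capacity_) {  // evict the least recently used entry
      map_.erase(order_.back());
      order_.pop_back();
    }
    order_.push_front(key);
    map_.emplace(key, std::make_pair(value, order_.begin()));
  }

 private:
  std::size_t capacity_;
  std::list<K> order_;  // front = most recently used
  std::unordered_map<K, std::pair<V, typename std::list<K>::iterator>> map_;
};
```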
17 changes: 10 additions & 7 deletions cpp/internal/raft_internal/matrix/select_k.cuh
@@ -33,6 +33,7 @@ struct params {
bool use_index_input = true;
bool use_same_leading_bits = false;
bool use_memory_pool = true;
double frac_infinities = 0.0;
};

inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream&
Expand All @@ -41,8 +42,10 @@ inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream&
os << ", len: " << ss.len;
os << ", k: " << ss.k;
os << (ss.select_min ? ", asc" : ", dsc");
os << (ss.use_index_input ? "" : ", no-input-index");
os << (ss.use_same_leading_bits ? ", same-leading-bits}" : "}");
if (!ss.use_index_input) { os << ", no-input-index"; }
if (ss.use_same_leading_bits) { os << ", same-leading-bits"; }
if (ss.frac_infinities > 0) { os << ", infs: " << ss.frac_infinities; }
os << "}";
return os;
}

Expand Down Expand Up @@ -147,23 +150,23 @@ void select_k_impl(const resources& handle,
stream);
case Algo::kWarpAuto:
return detail::select::warpsort::select_k<T, IdxT>(
in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
handle, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
case Algo::kWarpImmediate:
return detail::select::warpsort::
select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_immediate>(
in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
handle, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
case Algo::kWarpFiltered:
return detail::select::warpsort::
select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_filtered>(
in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
handle, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
case Algo::kWarpDistributed:
return detail::select::warpsort::
select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed>(
in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
handle, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
case Algo::kWarpDistributedShm:
return detail::select::warpsort::
select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
handle, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
case Algo::kFaissBlockSelect:
return neighbors::detail::select_k(
in, in_idx, batch_size, len, out, out_idx, select_min, k, stream);
