From eceba57e6938211bd72a9b15b304e4189d9e40fd Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 12 Oct 2022 16:13:11 +0200 Subject: [PATCH 01/31] Improvements for ivfpq_compute_similarity_kernel 1. Change the layout of pq_centers to facilitate coalesced access during search 2. Optimize the arithmetics in ivfpq_compute_score and ivfpq_compute_similarity_kernel --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 10 +- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 118 +++++++--- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 219 ++++++++++++------ 3 files changed, 230 insertions(+), 117 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 3dbf004e95..c95ad49a74 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -292,8 +292,8 @@ struct index : ann::index { /** * PQ cluster centers * - * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len] - * - codebook_gen::PER_CLUSTER: [n_lists, pq_book_size, pq_len] + * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] + * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] */ inline auto pq_centers() noexcept -> device_mdspan, row_major> { @@ -408,9 +408,9 @@ struct index : ann::index { { switch (codebook_kind()) { case codebook_gen::PER_SUBSPACE: - return make_extents(pq_dim(), pq_book_size(), pq_len()); + return make_extents(pq_dim(), pq_len(), pq_book_size()); case codebook_gen::PER_CLUSTER: - return make_extents(n_lists(), pq_book_size(), pq_len()); + return make_extents(n_lists(), pq_len(), pq_book_size()); default: RAFT_FAIL("Unreachable code"); } } @@ -420,7 +420,7 @@ struct index : ann::index { // If the dimensionality is large enough, we can reduce it to improve performance if (dim >= 128) { dim /= 2; } // Round it down to 32 to improve performance. 
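   // (e.g. an input dim of 200 is first halved to 100 and then rounded down to 96 here.)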
- uint32_t r = raft::round_down_safe(dim, 32); + auto r = raft::round_down_safe(dim, 32); if (r > 0) return r; // If the dimensionality is really low, round it to the closest power-of-two r = 1; diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index f13dcd8cc6..a62432da7f 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -16,10 +16,11 @@ #pragma once -#include "../ivf_pq_types.hpp" #include "ann_kmeans_balanced.cuh" #include "ann_utils.cuh" +#include + #include #include #include @@ -49,6 +50,10 @@ namespace raft::spatial::knn::ivf_pq::detail { using namespace raft::spatial::knn::detail; // NOLINT +using raft::neighbors::ivf_pq::codebook_gen; +using raft::neighbors::ivf_pq::index; +using raft::neighbors::ivf_pq::index_params; + namespace { /** @@ -318,6 +323,8 @@ void compute_pq_codes(const handle_t& handle, // utils::memzero(pq_dataset, n_rows * pq_dim * pq_bits / 8, stream); + uint32_t pq_width = 1 << pq_bits; + rmm::device_uvector pq_centers_tmp(pq_len * pq_width, stream, device_memory); rmm::device_uvector rot_vectors(max_cluster_size * rot_dim, stream, device_memory); rmm::device_uvector sub_vectors(max_cluster_size * pq_dim * pq_len, stream, device_memory); rmm::device_uvector sub_vector_labels(max_cluster_size * pq_dim, stream, device_memory); @@ -360,23 +367,36 @@ void compute_pq_codes(const handle_t& handle, stream)); } + if (codebook_kind == codebook_gen::PER_CLUSTER) { + linalg::writeOnlyUnaryOp( + pq_centers_tmp.data(), + pq_len * pq_width, + [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) { + auto i0 = i % pq_len; + auto i1 = i / pq_len; + *out = pq_centers[pq_width * pq_len * l + i0 * pq_width + i1]; + }, + stream); + } + // // Find a label (cluster ID) for each vector subspace. 
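    // (For PER_CLUSTER codebooks, the transposed copy in `pq_centers_tmp` prepared above is
    // reused for every subspace; for PER_SUBSPACE it is rebuilt for each subspace `j` below.
    // Either way, the copy restores the row-major [pq_book_size, pq_len] layout that
    // kmeans::predict expects as its centers argument.)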
// for (uint32_t j = 0; j < pq_dim; j++) { - const float* sub_pq_centers = nullptr; - switch (codebook_kind) { - case codebook_gen::PER_SUBSPACE: - sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * j; - break; - case codebook_gen::PER_CLUSTER: - sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * l; - break; - default: RAFT_FAIL("Unreachable code"); + if (codebook_kind == codebook_gen::PER_SUBSPACE) { + linalg::writeOnlyUnaryOp( + pq_centers_tmp.data(), + pq_len * pq_width, + [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) { + auto i0 = i % pq_len; + auto i1 = i / pq_len; + *out = pq_centers[pq_width * pq_len * j + i0 * pq_width + i1]; + }, + stream); } kmeans::predict(handle, - sub_pq_centers, - (1 << pq_bits), + pq_centers_tmp.data(), + pq_width, pq_len, sub_vectors.data() + j * (cluster_size * pq_len), cluster_size, @@ -460,6 +480,25 @@ auto calculate_offsets_and_indices(IdxT n_rows, return max_cluster_size; } +template +void transpose_pq_centers(index& index, + const float* pq_centers_source, + rmm::cuda_stream_view stream) +{ + auto pq_bits = index.pq_bits(); + auto pq_len = index.pq_len(); + linalg::writeOnlyUnaryOp( + index.pq_centers().data_handle(), + index.pq_centers().size(), + [pq_centers_source, pq_bits, pq_len] __device__(float* out, size_t i) { + auto i0 = i & ((1 << pq_bits) - 1); + auto i1 = (i >> pq_bits) % pq_len; + auto i2p = i - (i1 << pq_bits) - i0; + *out = pq_centers_source[i1 + i0 * pq_len + i2p]; + }, + stream); +} + template void train_per_subset(const handle_t& handle, index& index, @@ -472,6 +511,7 @@ void train_per_subset(const handle_t& handle, { auto stream = handle.get_stream(); + rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector sub_trainset(n_rows * index.pq_len(), stream, device_memory); rmm::device_uvector sub_labels(n_rows, stream, device_memory); @@ -512,20 +552,20 @@ void train_per_subset(const handle_t& handle, stream); // train PQ codebook for this subspace - kmeans::build_clusters( - handle, - kmeans_n_iters, - index.pq_len(), - sub_trainset.data(), - n_rows, - index.pq_book_size(), - index.pq_centers().data_handle() + (index.pq_book_size() * index.pq_len()) * j, - sub_labels.data(), - pq_cluster_sizes.data(), - raft::distance::DistanceType::L2Expanded, - stream, - device_memory); + kmeans::build_clusters(handle, + kmeans_n_iters, + index.pq_len(), + sub_trainset.data(), + n_rows, + index.pq_book_size(), + pq_centers_tmp.data() + (index.pq_book_size() * index.pq_len()) * j, + sub_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); } + transpose_pq_centers(index, pq_centers_tmp.data(), stream); } template @@ -539,6 +579,8 @@ void train_per_cluster(const handle_t& handle, rmm::mr::device_memory_resource* device_memory) { auto stream = handle.get_stream(); + + rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); rmm::device_uvector indices_buf(n_rows, stream, device_memory); rmm::device_uvector offsets_buf(index.list_offsets().size(), stream, managed_memory); @@ -584,20 +626,20 @@ void train_per_cluster(const handle_t& handle, size_t available_rows = cluster_size * index.pq_dim(); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster - kmeans::build_clusters( - handle, - kmeans_n_iters, - index.pq_len(), - rot_vectors.data(), - pq_n_rows, - 
index.pq_book_size(), - index.pq_centers().data_handle() + index.pq_book_size() * index.pq_len() * l, - pq_labels.data(), - pq_cluster_sizes.data(), - raft::distance::DistanceType::L2Expanded, - stream, - device_memory); + kmeans::build_clusters(handle, + kmeans_n_iters, + index.pq_len(), + rot_vectors.data(), + pq_n_rows, + index.pq_book_size(), + pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * l, + pq_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); } + transpose_pq_centers(index, pq_centers_tmp.data(), stream); } /** See raft::spatial::knn::ivf_pq::extend docs */ diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index b1f47a6c52..5535b532b7 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -16,11 +16,12 @@ #pragma once -#include "../ivf_pq_types.hpp" #include "ann_utils.cuh" #include "topk.cuh" #include "topk/warpsort_topk.cuh" +#include + #include #include #include @@ -56,6 +57,10 @@ static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)), using namespace raft::spatial::knn::detail; // NOLINT +using raft::neighbors::ivf_pq::codebook_gen; +using raft::neighbors::ivf_pq::index; +using raft::neighbors::ivf_pq::search_params; + /** 8-bit floating-point storage type. * * This is a custom type for the current IVF-PQ implementation. No arithmetic operations defined @@ -444,6 +449,15 @@ void postprocess_distances(float* out, // [n_queries, topk] } } +/** Extract `bits` lowest bits from the `source` and shift the remaining content by this number. */ +template +__device__ __forceinline__ auto extract_bits(OpT& source, uint32_t bits) -> uint8_t +{ + uint8_t r = source & ((1 << bits) - 1); + source >>= bits; + return r; +} + /** * @brief Compute the similarity score between a vector from `pq_dataset` and a query vector. * @@ -456,6 +470,7 @@ void postprocess_distances(float* out, // [n_queries, topk] * * @param pq_bits The bit length of an encoded vector element after compression by PQ * @param vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) + * == lcm(8 * sizeof(OpT), pq_bits) / pq_bits * @param pq_dim * @param[in] pq_code_ptr * a device pointer to the dataset at the indexed position (`pq_dim * pq_bits` bits-wide) @@ -465,31 +480,29 @@ void postprocess_distances(float* out, // [n_queries, topk] * @return the score for the entry `data_ix` in the `pq_dataset`. */ template -__device__ auto ivfpq_compute_score( - uint32_t pq_bits, uint32_t vec_len, uint32_t pq_dim, const OpT* pq_head, const LutT* lut_scores) - -> float +__device__ auto ivfpq_compute_score(const uint32_t pq_bits, + const uint32_t vec_len, + uint32_t pq_dim, + const OpT* pq_head, + const LutT* lut_scores) -> float { - float score = 0.0; - constexpr uint32_t kBitsTotal = 8 * sizeof(OpT); + float score = 0.0; + constexpr int32_t kBitsTotal = 8 * sizeof(OpT); + // These two nested loops combined span exactly `pq_dim` iterations. 
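+  // For example, with OpT = uint64_t and pq_bits = 6: kBitsTotal = 64 and
+  // vec_len = 64 / gcd(64, 6) = 32, so one outer iteration consumes
+  // lcm(64, 6) = 192 bits (three 64-bit words) and decodes exactly 32 codes;
+  // `bits_left` returns to zero precisely at that word boundary.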
for (; pq_dim > 0; pq_dim -= vec_len) { - OpT pq_code = pq_head[0]; - pq_head++; - auto bits_left = kBitsTotal; - for (uint32_t k = 0; k < vec_len; k++) { - uint8_t code = pq_code; - if (bits_left > pq_bits) { - pq_code >>= pq_bits; - bits_left -= pq_bits; - } else { - if (k < vec_len - 1) { - pq_code = pq_head[0]; - pq_head++; - } - code |= (pq_code << bits_left); - pq_code >>= (pq_bits - bits_left); - bits_left += (kBitsTotal - pq_bits); + OpT pq_code = *pq_head++; + // Loop over lcm(kBitsTotal, pq_bits) = pq_bits * vec_len. + // Thanks to `lcm` in the expression above, + // `bits_left` is zero after exactly vec_len iterations. + for (int32_t bits_left = kBitsTotal; bits_left != 0;) { + uint8_t code = extract_bits(pq_code, pq_bits); + bits_left -= pq_bits; + if (bits_left < 0) { // `code` is on the border between two `OpT` ints + pq_code = *pq_head++; + code <<= -bits_left; + code |= extract_bits(pq_code, -bits_left); + bits_left += kBitsTotal; } - code &= (1 << pq_bits) - 1; score += float(lut_scores[code]); lut_scores += (1 << pq_bits); } @@ -590,7 +603,7 @@ template -__launch_bounds__(1024) __global__ +__launch_bounds__(1024, 1) __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, uint32_t dim, uint32_t n_probes, @@ -615,7 +628,7 @@ __launch_bounds__(1024) __global__ /* Shared memory: * lut_scores: lookup table (LUT) of size = `pq_dim << pq_bits` (when EnableSMemLut) - * base_diff: size = dim (which is equal to `pq_dim * pq_len`) + * base_diff: size = dim (which is equal to `pq_dim * pq_len`) or dim*2 * topk::block_sort: some amount of shared memory, but overlaps with the rest: block_sort only needs shared memory for `.done()` operation, which can come very last. */ @@ -623,25 +636,30 @@ __launch_bounds__(1024) __global__ constexpr bool kManageLocalTopK = Capacity > 0; constexpr uint32_t kOpBits = 8 * sizeof(OpT); - const uint32_t pq_len = dim / pq_dim; - const uint32_t vec_len = kOpBits / gcd(kOpBits, pq_bits); + const uint32_t pq_len = dim / pq_dim; + const uint32_t vec_len = kOpBits / gcd(kOpBits, pq_bits); + const uint32_t lut_size = pq_dim << pq_bits; if constexpr (EnableSMemLut) { lut_scores = reinterpret_cast(smem_buf); } else { - lut_scores += (pq_dim << pq_bits) * blockIdx.x; + lut_scores += lut_size * blockIdx.x; } float* base_diff = nullptr; if constexpr (PrecompBaseDiff) { if constexpr (EnableSMemLut) { - base_diff = reinterpret_cast(lut_scores + (pq_dim << pq_bits)); + base_diff = reinterpret_cast(lut_scores + lut_size); } else { base_diff = reinterpret_cast(smem_buf); } } for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) { + if (ib >= gridDim.x) { + // sync shared memory accesses on the second and further iterations + __syncthreads(); + } uint32_t query_ix; uint32_t probe_ix; if (index_list == nullptr) { @@ -676,43 +694,68 @@ __launch_bounds__(1024) __global__ } if constexpr (PrecompBaseDiff) { - // Reduce computational complexity by pre-computing the difference - // between the cluster centroid and the query. - for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { - base_diff[i] = query[i] - cluster_center[i]; - } - __syncthreads(); - } - - // Create a lookup table - // For each subspace, the lookup table stores the distance between the actual query vector - // (projected into the subspace) and all possible pq vectors in that subspace. - for (uint32_t i = threadIdx.x; i < (pq_dim << pq_bits); i += blockDim.x) { - uint32_t i_pq = i >> pq_bits; - uint32_t i_code = codebook_kind == codebook_gen::PER_CLUSTER ? 
i & ((1 << pq_bits) - 1) : i; - float score = 0.0; + // Reduce number of memory reads later by pre-computing parts of the score switch (metric) { case distance::DistanceType::L2Expanded: { - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + (pq_len * i_pq); - float diff; - if constexpr (PrecompBaseDiff) { - diff = base_diff[k]; - } else { - diff = query[k] - cluster_center[k]; - } - diff -= pq_center[j + pq_len * i_code]; - score += diff * diff; + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + base_diff[i] = query[i] - cluster_center[i]; } } break; case distance::DistanceType::InnerProduct: { - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + (pq_len * i_pq); - score += query[k] * (cluster_center[k] + pq_center[j + pq_len * i_code]); + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + auto q = query[i]; + base_diff[i] = q; + base_diff[dim + i] = cluster_center[i] * q; } } break; } - lut_scores[i] = LutT(score); + __syncthreads(); + } + + { + // Create a lookup table + // For each subspace, the lookup table stores the distance between the actual query vector + // (projected into the subspace) and all possible pq vectors in that subspace. + const uint32_t pq_mask = (1u << pq_bits) - 1u; + for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { + const uint32_t i_pq = i >> pq_bits; + const uint32_t i_shift = i_pq * pq_len; + const float* cur_pq_center = + pq_center + (i & pq_mask) + + (codebook_kind == codebook_gen::PER_SUBSPACE ? i_shift << pq_bits : 0u); + float score = 0.0; + switch (metric) { + case distance::DistanceType::L2Expanded: { + for (uint32_t j = 0; j < pq_len; j++) { + uint32_t k = j + i_shift; + float diff; + if constexpr (PrecompBaseDiff) { + diff = base_diff[k]; + } else { + diff = query[k] - cluster_center[k]; + } + diff -= cur_pq_center[j << pq_bits]; + score += diff * diff; + } + } break; + case distance::DistanceType::InnerProduct: { + // NB: we negate the scores as we hardcoded select-topk to always take the minimum + for (uint32_t j = 0; j < pq_len; j++) { + uint32_t k = j + i_shift; + float q; + if constexpr (PrecompBaseDiff) { + q = base_diff[k]; + score -= base_diff[dim + k]; + } else { + q = query[k]; + score -= q * cluster_center[k]; + } + score -= q * cur_pq_center[j << pq_bits]; + } + } break; + } + lut_scores[i] = LutT(score); + } } uint32_t sample_offset = 0; @@ -736,13 +779,7 @@ __launch_bounds__(1024) __global__ auto pq_ptr = reinterpret_cast(pq_dataset + uint64_t(pq_line_width) * (cluster_offset + i)); float fscore = ivfpq_compute_score(pq_bits, vec_len, pq_dim, pq_ptr, lut_scores); - switch (metric) { - // For similarity metrics, - // we negate the scores as we hardcoded select-topk to always take the minimum - case distance::DistanceType::InnerProduct: fscore = -fscore; break; - default: break; - } - if (fscore < float(score)) { score = OutT{fscore}; } + if (fscore < float(score)) { score = OutT(fscore); } } if constexpr (kManageLocalTopK) { block_topk.add(score, cluster_offset + i); @@ -750,12 +787,11 @@ __launch_bounds__(1024) __global__ if (i < n_samples) { out_scores[i + sample_offset] = score; } } } - __syncthreads(); if constexpr (kManageLocalTopK) { - // sync threads before and after the topk merging operation, because we reuse smem_buf + // sync threads before the topk merging operation, because we reuse smem_buf + __syncthreads(); block_topk.done(); block_topk.store(out_scores, out_indices); - __syncthreads(); } else { // fill in the rest of the out_scores with dummy values uint32_t max_samples 
= uint32_t(Pow2<128>::roundUp(cluster_offsets[n_probes])); @@ -877,7 +913,7 @@ struct ivfpq_compute_similarity { static inline auto select(bool manage_local_topk, uint32_t pq_bits, uint32_t pq_dim, - uint32_t rot_dim, + uint32_t precomp_data_count, uint32_t preferred_thread_block_size, uint32_t n_queries, uint32_t n_probes, @@ -895,7 +931,7 @@ struct ivfpq_compute_similarity { const size_t smem_threshold = 48 * 1024; size_t smem_size = sizeof(LutT) * (pq_dim << pq_bits); - size_t smem_size_base_diff = sizeof(float) * rot_dim; + size_t smem_size_base_diff = sizeof(float) * precomp_data_count; uint32_t n_blocks = n_queries * n_probes; uint32_t n_threads = 1024; @@ -978,8 +1014,8 @@ struct ivfpq_compute_similarity { RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &kernel_fast_n_blocks, kernel_fast, n_threads, smem_size + smem_size_base_diff)); - // Use "kernel_fast" only if GPU occupancy does not drop - if (kernel_no_basediff_n_blocks == kernel_fast_n_blocks) { + // Use "kernel_fast" only if GPU occupancy does not drop too much + if (kernel_no_basediff_n_blocks * 3 <= kernel_fast_n_blocks * 4) { kernel = kernel_fast; smem_size += smem_size_base_diff; } @@ -1096,17 +1132,48 @@ void ivfpq_search_worker(const handle_t& handle, } // select and run the main search kernel + uint32_t precomp_data_count = 0; + switch (index.metric()) { + case distance::DistanceType::L2SqrtExpanded: + case distance::DistanceType::L2SqrtUnexpanded: + case distance::DistanceType::L2Unexpanded: + case distance::DistanceType::L2Expanded: { + // stores basediff (query[i] - center[i]) + precomp_data_count = index.pq_dim(); + } break; + case distance::DistanceType::InnerProduct: { + // stores two components (query[i] * center[i], center[i]) + precomp_data_count = index.pq_dim() * 2; + } break; + default: { + RAFT_FAIL("Unsupported metric"); + } break; + } auto search_instance = ivfpq_compute_similarity::select(manage_local_topk, index.pq_bits(), index.pq_dim(), - index.rot_dim(), + precomp_data_count, preferred_thread_block_size, n_queries, n_probes, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); + printf( + "===| raft: n_rows = %zu, n_lists = %u, n_queries = %u, n_probes = %u, dim = %u, pq_dim = " + "%u, k = %u, max_samples = %u, metric = %d, codebook_kind = %d\n", + uint64_t(index.size()), + uint32_t(index.n_lists()), + uint32_t(n_queries), + uint32_t(n_probes), + uint32_t(index.rot_dim()), + uint32_t(index.pq_dim()), + uint32_t(topK), + uint32_t(max_samples), + int(index.metric()), + int(index.codebook_kind())); + search_instance(stream, index.size(), index.rot_dim(), @@ -1261,7 +1328,11 @@ inline void search(const handle_t& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); common::nvtx::range fun_scope( - "ivf_pq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)", + n_queries, + params.n_probes, + k, + index.dim()); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, From 3eb37b960be9257a95cab9636393806713c0ae7c Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 12 Oct 2022 16:19:46 +0200 Subject: [PATCH 02/31] Remove debugging printf --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 
5535b532b7..46a0f6c60a 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1160,19 +1160,6 @@ void ivfpq_search_worker(const handle_t& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - printf( - "===| raft: n_rows = %zu, n_lists = %u, n_queries = %u, n_probes = %u, dim = %u, pq_dim = " - "%u, k = %u, max_samples = %u, metric = %d, codebook_kind = %d\n", - uint64_t(index.size()), - uint32_t(index.n_lists()), - uint32_t(n_queries), - uint32_t(n_probes), - uint32_t(index.rot_dim()), - uint32_t(index.pq_dim()), - uint32_t(topK), - uint32_t(max_samples), - int(index.metric()), - int(index.codebook_kind())); search_instance(stream, index.size(), From 094a8d5e76985a27d8d25cb5c0ad2d58c89bb153 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 12 Oct 2022 20:06:09 +0200 Subject: [PATCH 03/31] Fix integer overflow for large n_queries * max_samples --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 46a0f6c60a..e71256f7b6 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -682,7 +682,7 @@ __launch_bounds__(1024, 1) __global__ } else { // Store all calculated distances to out_scores auto max_samples = Pow2<128>::roundUp(cluster_offsets[n_probes]); - out_scores = _out_scores + max_samples * query_ix; + out_scores = _out_scores + uint64_t(max_samples) * query_ix; } uint32_t label = cluster_labels[n_probes * query_ix + probe_ix]; const float* cluster_center = cluster_centers + (dim * label); @@ -1080,11 +1080,11 @@ void ivfpq_search_worker(const handle_t& handle, rmm::device_uvector num_samples(n_queries, stream, mr); rmm::device_uvector chunk_index(n_queries * n_probes, stream, mr); // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk] - rmm::device_uvector distances_buf(n_queries * topk_len, stream, mr); + rmm::device_uvector distances_buf(uint64_t(n_queries) * uint64_t(topk_len), stream, mr); rmm::device_uvector neighbors_buf(0, stream, mr); IdxT* neighbors_ptr = nullptr; if (manage_local_topk) { - neighbors_buf.resize(n_queries * topk_len, stream); + neighbors_buf.resize(uint64_t(n_queries) * topk_len, stream); neighbors_ptr = neighbors_buf.data(); } From bb68b03de40c24921c564d923190d931c4f5fcc9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 13 Oct 2022 15:58:57 +0200 Subject: [PATCH 04/31] Fix bad top-k kernel selection --- cpp/include/raft/spatial/knn/detail/topk.cuh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/spatial/knn/detail/topk.cuh index 5adf6df472..f4dcb53088 100644 --- a/cpp/include/raft/spatial/knn/detail/topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk.cuh @@ -19,6 +19,8 @@ #include "topk/radix_topk.cuh" #include "topk/warpsort_topk.cuh" +#include + #include #include @@ -73,7 +75,11 @@ void select_topk(const T* in, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = nullptr) { - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) { + common::nvtx::range fun_scope( + "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + // TODO (achirkin): investigate the trade-off for a wider variety of inputs. 
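+  // With the thresholds below, the radix-based top-k is chosen for large batches over long
+  // lists with a large k, even when k would still fit into the warp-sort capacity.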
+ const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128; + if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) { topk::warp_sort_topk( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); } else { From 33bc1857abf9a032e5d711bb313e1c90d447ce7e Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 13 Oct 2022 16:01:47 +0200 Subject: [PATCH 05/31] Limit max batch size to avoid extra large temporary buffers --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index e71256f7b6..1014419c52 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -682,7 +682,7 @@ __launch_bounds__(1024, 1) __global__ } else { // Store all calculated distances to out_scores auto max_samples = Pow2<128>::roundUp(cluster_offsets[n_probes]); - out_scores = _out_scores + uint64_t(max_samples) * query_ix; + out_scores = _out_scores + max_samples * query_ix; } uint32_t label = cluster_labels[n_probes * query_ix + probe_ix]; const float* cluster_center = cluster_centers + (dim * label); @@ -1031,6 +1031,14 @@ struct ivfpq_compute_similarity { } }; +auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool +{ + if (k > kMaxCapacity) { return false; } // warp_sort not possible + if (n_probes <= 16) { return false; } // too few clusters + if (n_queries * n_probes <= 256) { return false; } // overall amount of work is too small + return true; +} + /** * The "main part" of the search, which assumes that outer-level `search` has already: * @@ -1062,12 +1070,8 @@ void ivfpq_search_worker(const handle_t& handle, auto cluster_centers = index.centers_rot().data_handle(); auto cluster_offsets = index.list_offsets().data_handle(); - bool manage_local_topk = topK <= kMaxCapacity // depth is not too large - && n_probes >= 16 // not too few clusters looked up - && - n_queries * n_probes >= 256 // overall amount of work is not too small - ; - auto topk_len = manage_local_topk ? n_probes * topK : max_samples; + bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); + auto topk_len = manage_local_topk ? n_probes * topK : max_samples; if (manage_local_topk) { RAFT_LOG_DEBUG("Fused version of the search kernel is selected (manage_local_topk == true)"); } else { @@ -1080,11 +1084,11 @@ void ivfpq_search_worker(const handle_t& handle, rmm::device_uvector num_samples(n_queries, stream, mr); rmm::device_uvector chunk_index(n_queries * n_probes, stream, mr); // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk] - rmm::device_uvector distances_buf(uint64_t(n_queries) * uint64_t(topk_len), stream, mr); + rmm::device_uvector distances_buf(n_queries * topk_len, stream, mr); rmm::device_uvector neighbors_buf(0, stream, mr); IdxT* neighbors_ptr = nullptr; if (manage_local_topk) { - neighbors_buf.resize(uint64_t(n_queries) * topk_len, stream); + neighbors_buf.resize(n_queries * topk_len, stream); neighbors_ptr = neighbors_buf.data(); } @@ -1280,12 +1284,18 @@ struct ivfpq_search { * A heuristic for bounding the number of queries per batch, to improve GPU utilization. * (based on the number of SMs and the work size). * + * @param k top-k + * @param n_probes number of selected clusters per query * @param n_queries number of queries hoped to be processed at once. 
* (maximum value for the returned batch size) + * @param max_samples maximum possible number of samples to be processed for the given `n_probes` * * @return maximum recommended batch size. */ -inline auto get_max_batch_size(uint32_t n_queries) -> uint32_t +inline auto get_max_batch_size(uint32_t k, + uint32_t n_probes, + uint32_t n_queries, + uint32_t max_samples) -> uint32_t { uint32_t max_batch_size = n_queries; uint32_t n_ctas_total = getMultiProcessorCount() * 2; @@ -1297,6 +1307,23 @@ inline auto get_max_batch_size(uint32_t n_queries) -> uint32_t float utilization_1 = float(n_ctas_total_per_batch_1 * max_batch_size_1) / n_ctas_total; if (utilization < utilization_1) { max_batch_size = max_batch_size_1; } } + // Check in the tmp distance buffer is not too big + auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { + return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; + }; + constexpr uint64_t kMaxWsSize = 2ul * 1024 * 1024 * 1024; + if (ws_size(max_batch_size) > kMaxWsSize) { + uint32_t smaller_batch_size = 1; + // take powers of two for better alignment + while (smaller_batch_size * 2 <= max_batch_size) { + smaller_batch_size <<= 1; + } + // gradually reduce the batch size until we fit into the max size limit. + while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) { + smaller_batch_size >>= 1; + } + return smaller_batch_size; + } return max_batch_size; } @@ -1384,7 +1411,7 @@ inline void search(const handle_t& handle, // Maximum number of query vectors to search at the same time. const auto max_queries = std::min(std::max(n_queries, 1), 4096); - auto max_batch_size = get_max_batch_size(max_queries); + auto max_batch_size = get_max_batch_size(k, n_probes, max_queries, max_samples); rmm::device_uvector float_queries(max_queries * dim_ext, stream, mr); rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); From 9135a799b941344a28bba981f737d6360e09c5ca Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 19 Oct 2022 09:54:11 +0200 Subject: [PATCH 06/31] Change the layout of pq_centers for better memory utilization and optimize ALU usage in the main kernel --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 19 +- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 228 ++++++------ .../raft/spatial/knn/detail/ivf_pq_search.cuh | 328 ++++++++++++------ 3 files changed, 355 insertions(+), 220 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index c95ad49a74..815e3fc1de 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -289,18 +289,20 @@ struct index : ann::index { check_consistency(); } + using pq_centers_extents = + std::experimental::extents; /** * PQ cluster centers * - * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] - * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] + * - codebook_gen::PER_SUBSPACE: [pq_dim , ceildiv(pq_len, 4), pq_book_size, 4] + * - codebook_gen::PER_CLUSTER: [n_lists, ceildiv(pq_len, 4), pq_book_size, 4] */ - inline auto pq_centers() noexcept -> device_mdspan, row_major> + inline auto pq_centers() noexcept -> device_mdspan { return pq_centers_.view(); } [[nodiscard]] inline auto pq_centers() const noexcept - -> device_mdspan, row_major> + -> device_mdspan { return pq_centers_.view(); } @@ -383,7 +385,7 @@ struct index : ann::index { uint32_t pq_dim_; uint32_t n_nonempty_lists_; - device_mdarray, row_major> pq_centers_; + 
device_mdarray pq_centers_; device_mdarray, row_major> pq_dataset_; device_mdarray, row_major> indices_; device_mdarray, row_major> rotation_matrix_; @@ -404,13 +406,14 @@ struct index : ann::index { pq_bits() * pq_dim()); } - auto make_pq_centers_extents() -> extent_3d + auto make_pq_centers_extents() -> pq_centers_extents { + auto d = raft::div_rounding_up_unsafe(pq_len(), 4); switch (codebook_kind()) { case codebook_gen::PER_SUBSPACE: - return make_extents(pq_dim(), pq_len(), pq_book_size()); + return make_extents(pq_dim(), d, pq_book_size(), 4); case codebook_gen::PER_CLUSTER: - return make_extents(n_lists(), pq_len(), pq_book_size()); + return make_extents(n_lists(), d, pq_book_size(), 4); default: RAFT_FAIL("Unreachable code"); } } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index a62432da7f..3f88c616ae 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -230,7 +230,7 @@ void select_residuals(const handle_t& handle, ) { auto stream = handle.get_stream(); - rmm::device_uvector tmp(n_rows * dim, stream, device_memory); + rmm::device_uvector tmp(size_t(n_rows) * size_t(dim), stream, device_memory); utils::copy_selected( n_rows, (IdxT)dim, dataset, row_ids, (IdxT)dim, tmp.data(), (IdxT)dim, stream); @@ -287,25 +287,26 @@ void select_residuals(const handle_t& handle, * @param device_memory */ template -void compute_pq_codes(const handle_t& handle, - IdxT n_rows, - uint32_t data_dim, - uint32_t rot_dim, - uint32_t pq_dim, - uint32_t pq_len, - uint32_t pq_bits, - uint32_t n_clusters, - codebook_gen codebook_kind, - uint32_t max_cluster_size, - float* cluster_centers, - const float* rotation_matrix, - const T* dataset, - const IdxT* data_indices, - const uint32_t* cluster_sizes, - const IdxT* cluster_offsets, - const float* pq_centers, - uint8_t* pq_dataset, - rmm::mr::device_memory_resource* device_memory) +void compute_pq_codes( + const handle_t& handle, + IdxT n_rows, + uint32_t data_dim, + uint32_t rot_dim, + uint32_t pq_dim, + uint32_t pq_len, + uint32_t pq_bits, + uint32_t n_clusters, + codebook_gen codebook_kind, + uint32_t max_cluster_size, + float* cluster_centers, + const float* rotation_matrix, + const T* dataset, + const IdxT* data_indices, + const uint32_t* cluster_sizes, + const IdxT* cluster_offsets, + device_mdspan::pq_centers_extents, row_major> pq_centers, + uint8_t* pq_dataset, + rmm::mr::device_memory_resource* device_memory) { common::nvtx::range fun_scope( "ivf_pq::compute_pq_codes(n_rows = %zu, data_dim = %u, rot_dim = %u (%u * %u), n_clusters = " @@ -321,15 +322,17 @@ void compute_pq_codes(const handle_t& handle, // // Compute PQ code // - utils::memzero(pq_dataset, n_rows * pq_dim * pq_bits / 8, stream); uint32_t pq_width = 1 << pq_bits; rmm::device_uvector pq_centers_tmp(pq_len * pq_width, stream, device_memory); - rmm::device_uvector rot_vectors(max_cluster_size * rot_dim, stream, device_memory); - rmm::device_uvector sub_vectors(max_cluster_size * pq_dim * pq_len, stream, device_memory); - rmm::device_uvector sub_vector_labels(max_cluster_size * pq_dim, stream, device_memory); + rmm::device_uvector rot_vectors( + size_t(max_cluster_size) * size_t(rot_dim), stream, device_memory); + rmm::device_uvector sub_vectors( + size_t(max_cluster_size) * size_t(pq_dim * pq_len), stream, device_memory); + rmm::device_uvector sub_vector_labels( + size_t(max_cluster_size) * size_t(pq_dim), stream, device_memory); 
rmm::device_uvector my_pq_dataset( - max_cluster_size * pq_dim * pq_bits / 8 /* NB: pq_dim * bitPQ % 8 == 0 */, + size_t(max_cluster_size) * size_t(pq_dim * pq_bits / 8) /* NB: pq_dim * bitPQ % 8 == 0 */, stream, device_memory); @@ -345,7 +348,7 @@ void compute_pq_codes(const handle_t& handle, data_dim, rot_dim, rotation_matrix, - cluster_centers + uint64_t(l) * data_dim, + cluster_centers + size_t(l) * size_t(data_dim), dataset, data_indices + cluster_offsets[l], device_memory); @@ -357,14 +360,15 @@ void compute_pq_codes(const handle_t& handle, // output: sub_vectors[pq_dim, cluster_size, pq_len] // for (uint32_t i = 0; i < pq_dim; i++) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(sub_vectors.data() + i * pq_len * cluster_size, - sizeof(float) * pq_len, - rot_vectors.data() + i * pq_len, - sizeof(float) * rot_dim, - sizeof(float) * pq_len, - cluster_size, - cudaMemcpyDefault, - stream)); + RAFT_CUDA_TRY( + cudaMemcpy2DAsync(sub_vectors.data() + size_t(i) * size_t(pq_len) * size_t(cluster_size), + sizeof(float) * pq_len, + rot_vectors.data() + i * pq_len, + sizeof(float) * rot_dim, + sizeof(float) * pq_len, + cluster_size, + cudaMemcpyDefault, + stream)); } if (codebook_kind == codebook_gen::PER_CLUSTER) { @@ -372,9 +376,11 @@ void compute_pq_codes(const handle_t& handle, pq_centers_tmp.data(), pq_len * pq_width, [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) { - auto i0 = i % pq_len; - auto i1 = i / pq_len; - *out = pq_centers[pq_width * pq_len * l + i0 * pq_width + i1]; + auto pq_i = i % pq_len; + auto i0 = pq_i % 4; + auto i2 = pq_i / 4; + auto i1 = i / pq_len; + *out = pq_centers(l, i2, i1, i0); }, stream); } @@ -388,9 +394,11 @@ void compute_pq_codes(const handle_t& handle, pq_centers_tmp.data(), pq_len * pq_width, [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) { - auto i0 = i % pq_len; - auto i1 = i / pq_len; - *out = pq_centers[pq_width * pq_len * j + i0 * pq_width + i1]; + auto pq_i = i % pq_len; + auto i0 = pq_i % 4; + auto i2 = pq_i / 4; + auto i1 = i / pq_len; + *out = pq_centers(j, i2, i1, i0); }, stream); } @@ -398,9 +406,9 @@ void compute_pq_codes(const handle_t& handle, pq_centers_tmp.data(), pq_width, pq_len, - sub_vectors.data() + j * (cluster_size * pq_len), + sub_vectors.data() + size_t(j) * size_t(cluster_size) * size_t(pq_len), cluster_size, - sub_vector_labels.data() + j * cluster_size, + sub_vector_labels.data() + size_t(j) * size_t(cluster_size), raft::distance::DistanceType::L2Expanded, stream, device_memory); @@ -411,9 +419,9 @@ void compute_pq_codes(const handle_t& handle, // ivfpq_encode( cluster_size, pq_dim, pq_bits, sub_vector_labels.data(), my_pq_dataset.data(), stream); - copy(pq_dataset + cluster_offsets[l] * uint64_t{pq_dim * pq_bits / 8}, + copy(pq_dataset + size_t(cluster_offsets[l]) * size_t(pq_dim * pq_bits / 8), my_pq_dataset.data(), - cluster_size * pq_dim * pq_bits / 8, + size_t(cluster_size) * size_t(pq_dim * pq_bits / 8), stream); } } @@ -424,7 +432,7 @@ __launch_bounds__(BlockDim) __global__ void fill_indices_kernel(IdxT n_rows, IdxT* data_offsets, const uint32_t* labels) { - const auto i = BlockDim * IdxT(blockIdx.x) + IdxT(threadIdx.x); + const auto i = IdxT(BlockDim) * IdxT(blockIdx.x) + IdxT(threadIdx.x); if (i >= n_rows) { return; } data_indices[atomicAdd(data_offsets + labels[i], 1)] = i; } @@ -485,16 +493,24 @@ void transpose_pq_centers(index& index, const float* pq_centers_source, rmm::cuda_stream_view stream) { - auto pq_bits = index.pq_bits(); - auto pq_len = index.pq_len(); + auto extents = 
index.pq_centers().extents(); + auto pq_len = index.pq_len(); + auto pq_width = index.pq_book_size(); linalg::writeOnlyUnaryOp( index.pq_centers().data_handle(), index.pq_centers().size(), - [pq_centers_source, pq_bits, pq_len] __device__(float* out, size_t i) { - auto i0 = i & ((1 << pq_bits) - 1); - auto i1 = (i >> pq_bits) % pq_len; - auto i2p = i - (i1 << pq_bits) - i0; - *out = pq_centers_source[i1 + i0 * pq_len + i2p]; + [pq_centers_source, extents, pq_len, pq_width] __device__(float* out, size_t i) { + uint32_t ii[4]; + for (int r = 3; r > 0; r--) { + ii[r] = i % extents.extent(r); + i /= extents.extent(r); + } + ii[0] = i; + auto j2 = ii[3] + ii[1] * extents.extent(3); + auto j1 = ii[2]; + auto j0 = ii[0]; + auto j = ((j0 * pq_width) + j1) * pq_len + j2; + *out = j2 < pq_len ? pq_centers_source[j] : 0.0f; }, stream); } @@ -502,7 +518,7 @@ void transpose_pq_centers(index& index, template void train_per_subset(const handle_t& handle, index& index, - IdxT n_rows, + size_t n_rows, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, @@ -512,7 +528,7 @@ void train_per_subset(const handle_t& handle, auto stream = handle.get_stream(); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - rmm::device_uvector sub_trainset(n_rows * index.pq_len(), stream, device_memory); + rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); rmm::device_uvector sub_labels(n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -523,14 +539,15 @@ void train_per_subset(const handle_t& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. 
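    // (copy_selected gathers, for every training row, the j-th pq_len-wide slice of the
    // rotated center of the cluster given by `labels`, using rot_dim as the source stride.)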
- utils::copy_selected(n_rows, - (IdxT)index.pq_len(), - index.centers_rot().data_handle() + index.pq_len() * j, - labels, - (IdxT)index.rot_dim(), - sub_trainset.data(), - (IdxT)index.pq_len(), - stream); + utils::copy_selected( + n_rows, + index.pq_len(), + index.centers_rot().data_handle() + index.pq_len() * j, + labels, + index.rot_dim(), + sub_trainset.data(), + index.pq_len(), + stream); // sub_trainset is the slice of: rotate(trainset) - centers_rot float alpha = 1.0; @@ -558,7 +575,7 @@ void train_per_subset(const handle_t& handle, sub_trainset.data(), n_rows, index.pq_book_size(), - pq_centers_tmp.data() + (index.pq_book_size() * index.pq_len()) * j, + pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, sub_labels.data(), pq_cluster_sizes.data(), raft::distance::DistanceType::L2Expanded, @@ -571,7 +588,7 @@ void train_per_subset(const handle_t& handle, template void train_per_cluster(const handle_t& handle, index& index, - IdxT n_rows, + size_t n_rows, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, @@ -585,22 +602,24 @@ void train_per_cluster(const handle_t& handle, rmm::device_uvector indices_buf(n_rows, stream, device_memory); rmm::device_uvector offsets_buf(index.list_offsets().size(), stream, managed_memory); - raft::stats::histogram(raft::stats::HistTypeAuto, - reinterpret_cast(cluster_sizes.data()), - IdxT(index.n_lists()), - labels, - n_rows, - 1, - stream); + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(cluster_sizes.data()), + index.n_lists(), + labels, + n_rows, + 1, + stream); auto cluster_offsets = offsets_buf.data(); auto indices = indices_buf.data(); uint32_t max_cluster_size = calculate_offsets_and_indices( - n_rows, index.n_lists(), labels, cluster_sizes.data(), cluster_offsets, indices, stream); + IdxT(n_rows), index.n_lists(), labels, cluster_sizes.data(), cluster_offsets, indices, stream); - rmm::device_uvector pq_labels(max_cluster_size * index.pq_dim(), stream, device_memory); + rmm::device_uvector pq_labels( + size_t(max_cluster_size) * size_t(index.pq_dim()), stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); - rmm::device_uvector rot_vectors(max_cluster_size * index.rot_dim(), stream, device_memory); + rmm::device_uvector rot_vectors( + size_t(max_cluster_size) * size_t(index.rot_dim()), stream, device_memory); handle.sync_stream(); // make sure cluster offsets are up-to-date for (uint32_t l = 0; l < index.n_lists(); l++) { @@ -615,29 +634,30 @@ void train_per_cluster(const handle_t& handle, index.dim(), index.rot_dim(), index.rotation_matrix().data_handle(), - index.centers().data_handle() + uint64_t(l) * index.dim_ext(), + index.centers().data_handle() + size_t(l) * size_t(index.dim_ext()), trainset, indices + cluster_offsets[l], device_memory); // limit the cluster size to bound the training time. 
// [sic] we interpret the data as pq_len-dimensional - size_t big_enough = 256 * std::max(index.pq_book_size(), index.pq_dim()); - size_t available_rows = cluster_size * index.pq_dim(); + size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster - kmeans::build_clusters(handle, - kmeans_n_iters, - index.pq_len(), - rot_vectors.data(), - pq_n_rows, - index.pq_book_size(), - pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * l, - pq_labels.data(), - pq_cluster_sizes.data(), - raft::distance::DistanceType::L2Expanded, - stream, - device_memory); + kmeans::build_clusters( + handle, + kmeans_n_iters, + index.pq_len(), + rot_vectors.data(), + pq_n_rows, + index.pq_book_size(), + pq_centers_tmp.data() + size_t(index.pq_book_size()) * size_t(index.pq_len()) * size_t(l), + pq_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); } transpose_pq_centers(index, pq_centers_tmp.data(), stream); } @@ -678,7 +698,8 @@ inline auto extend(const handle_t& handle, // const auto n_clusters = orig_index.n_lists(); - rmm::device_uvector cluster_centers(n_clusters * orig_index.dim(), stream, device_memory); + rmm::device_uvector cluster_centers( + size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory); RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), sizeof(float) * orig_index.dim(), orig_index.centers().data_handle(), @@ -733,7 +754,7 @@ inline auto extend(const handle_t& handle, // Compute PQ code for new vectors // rmm::device_uvector new_pq_codes( - n_rows * orig_index.pq_dim() * orig_index.pq_bits() / 8, stream, device_memory); + size_t(n_rows) * size_t(orig_index.pq_dim() * orig_index.pq_bits() / 8), stream, device_memory); compute_pq_codes(handle, n_rows, orig_index.dim(), @@ -750,7 +771,7 @@ inline auto extend(const handle_t& handle, new_data_indices.data(), new_cluster_sizes, new_cluster_offsets.data(), - orig_index.pq_centers().data_handle(), + orig_index.pq_centers(), new_pq_codes.data(), device_memory); @@ -931,14 +952,14 @@ inline auto extend(const handle_t& handle, size_t pq_dataset_unit = ext_index.pq_dim() * ext_index.pq_bits() / 8; for (uint32_t l = 0; l < ext_index.n_lists(); l++) { auto k = cluster_ordering.data()[l]; - auto old_cluster_size = old_cluster_sizes[k]; - copy(ext_pq_dataset + pq_dataset_unit * ext_cluster_offsets[l], - orig_index.pq_dataset().data_handle() + pq_dataset_unit * old_cluster_offsets[k], + auto old_cluster_size = size_t(old_cluster_sizes[k]); + copy(ext_pq_dataset + pq_dataset_unit * size_t(ext_cluster_offsets[l]), + orig_index.pq_dataset().data_handle() + pq_dataset_unit * size_t(old_cluster_offsets[k]), pq_dataset_unit * old_cluster_size, stream); - copy(ext_pq_dataset + pq_dataset_unit * (ext_cluster_offsets[l] + old_cluster_size), - new_pq_codes.data() + pq_dataset_unit * new_cluster_offsets.data()[k], - pq_dataset_unit * new_cluster_sizes[k], + copy(ext_pq_dataset + pq_dataset_unit * size_t(ext_cluster_offsets[l] + old_cluster_size), + new_pq_codes.data() + pq_dataset_unit * size_t(new_cluster_offsets.data()[k]), + pq_dataset_unit * size_t(new_cluster_sizes[k]), stream); } @@ -963,9 +984,10 @@ inline auto build( ivf_pq::index index(handle, params, dim); utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); - auto trainset_ratio = std::max( - 1, 
n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); - auto n_rows_train = n_rows / trainset_ratio; + auto trainset_ratio = std::max( + 1, + size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); + size_t n_rows_train = n_rows / trainset_ratio; rmm::mr::device_memory_resource* device_memory = nullptr; auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); @@ -995,10 +1017,10 @@ inline auto build( auto dim = index.dim(); linalg::writeOnlyUnaryOp( trainset.data(), - index.dim() * n_rows_train, + size_t(index.dim()) * n_rows_train, [dataset, trainset_ratio, dim] __device__(float* out, size_t i) { auto col = i % dim; - *out = utils::mapping{}(dataset[(i - col) * trainset_ratio + col]); + *out = utils::mapping{}(dataset[(i - col) * size_t(trainset_ratio) + col]); }, stream); } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 1014419c52..6ddb2709eb 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -449,13 +449,67 @@ void postprocess_distances(float* out, // [n_queries, topk] } } -/** Extract `bits` lowest bits from the `source` and shift the remaining content by this number. */ -template -__device__ __forceinline__ auto extract_bits(OpT& source, uint32_t bits) -> uint8_t +/** Extract `Bits` lowest bits from the `source` and shift the remaining content by this number. */ +template +__device__ __forceinline__ auto extract_bits(OpT& source) -> uint8_t { - uint8_t r = source & ((1 << bits) - 1); - source >>= bits; - return r; + if constexpr (Bits < 8 * sizeof(OpT)) { + constexpr uint32_t Mask = (1u << Bits) - 1u; // NOLINT + uint8_t r = source & Mask; + source >>= Bits; + return r; + } else { + uint8_t r = source; + source = 0; + return r; + } +} + +/** + * Starting with `BitsLeft = kBitsTotal = 8 * sizeof(OpT)`, + * loop over `lcm(kBitsTotal, PqBits) = PqBits * kVecLen` bits of data. + * Thanks to `lcm` in the expression above, + * `BitsLeft` is zero after exactly kVecLen iterations. + */ +template +__device__ __forceinline__ void compute_score_inner_loop( + const OpT*& pq_head, const LutT*& lut_scores, OpT& pq_code, float& score, const bool valid_range) +{ + uint8_t code = extract_bits(pq_code); + if constexpr (BitsLeft < PqBits) { // `code` is on the border between two `OpT` ints + pq_code = valid_range ? *pq_head : OpT(0); + ++pq_head; + code <<= PqBits - BitsLeft; + code |= extract_bits(pq_code); + } + score += valid_range ? float(lut_scores[code]) : 0.0f; + lut_scores += (1u << PqBits); + if constexpr (BitsLeft != PqBits) { + constexpr uint32_t kBitsLeftNext = + BitsLeft < PqBits ? (8 * sizeof(OpT) + BitsLeft - PqBits) : (BitsLeft - PqBits); + compute_score_inner_loop( + pq_head, lut_scores, pq_code, score, valid_range); + } +} + +template +__device__ __forceinline__ auto compute_score(uint32_t pq_dim, + const OpT* pq_head, + const LutT* lut_scores, + const bool valid_range) -> float +{ + float score = 0.0; + constexpr uint32_t kBitsTotal = 8 * sizeof(OpT); + constexpr uint32_t kVecLen = kBitsTotal / gcd(kBitsTotal, PqBits); + + // These two nested loops combined span exactly `pq_dim` iterations. + for (; pq_dim > 0; pq_dim -= kVecLen) { + OpT pq_code = valid_range ? 
*pq_head : OpT(0); + ++pq_head; + compute_score_inner_loop( + pq_head, lut_scores, pq_code, score, valid_range); + } + return score; } /** @@ -466,50 +520,75 @@ __device__ __forceinline__ auto extract_bits(OpT& source, uint32_t bits) -> uint * 1. `pq_bits * vec_len % 8 * sizeof(OpT) == 0`. * 2. `pq_dim % vec_len == 0` * + * Here `vec_len` is fully determined by `OpT` and `pq_bits`: + * vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) + * == lcm(8 * sizeof(OpT), pq_bits) / pq_bits + * + * * @tparam LutT type of the elements in the lookup table. * * @param pq_bits The bit length of an encoded vector element after compression by PQ - * @param vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) - * == lcm(8 * sizeof(OpT), pq_bits) / pq_bits * @param pq_dim * @param[in] pq_code_ptr * a device pointer to the dataset at the indexed position (`pq_dim * pq_bits` bits-wide) * @param[in] lut_scores * a device or shared memory pointer to the lookup table [pq_dim, pq_book_size] + * @param valid_range + * whether reads and writes are allowed * * @return the score for the entry `data_ix` in the `pq_dataset`. */ template -__device__ auto ivfpq_compute_score(const uint32_t pq_bits, - const uint32_t vec_len, - uint32_t pq_dim, - const OpT* pq_head, - const LutT* lut_scores) -> float +__device__ __noinline__ auto ivfpq_compute_score(const uint32_t pq_bits, + uint32_t pq_dim, + const OpT* pq_head, + const LutT* lut_scores, + const bool valid_range) -> float { - float score = 0.0; - constexpr int32_t kBitsTotal = 8 * sizeof(OpT); - // These two nested loops combined span exactly `pq_dim` iterations. - for (; pq_dim > 0; pq_dim -= vec_len) { - OpT pq_code = *pq_head++; - // Loop over lcm(kBitsTotal, pq_bits) = pq_bits * vec_len. - // Thanks to `lcm` in the expression above, - // `bits_left` is zero after exactly vec_len iterations. - for (int32_t bits_left = kBitsTotal; bits_left != 0;) { - uint8_t code = extract_bits(pq_code, pq_bits); - bits_left -= pq_bits; - if (bits_left < 0) { // `code` is on the border between two `OpT` ints - pq_code = *pq_head++; - code <<= -bits_left; - code |= extract_bits(pq_code, -bits_left); - bits_left += kBitsTotal; - } - score += float(lut_scores[code]); - lut_scores += (1 << pq_bits); - } + switch (pq_bits) { + case 4: return compute_score<4, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); + case 5: return compute_score<5, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); + case 6: return compute_score<6, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); + case 7: return compute_score<7, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); + case 8: return compute_score<8, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); } - return score; } +// template +// __device__ __forceinline__ auto ivfpq_compute_score(const uint32_t pq_bits, +// const uint32_t vec_len, +// uint32_t pq_dim, +// const OpT* pq_head, +// const LutT* lut_scores, +// bool valid_range) -> float +// { +// float score = 0.0; +// constexpr int32_t kBitsTotal = 8 * sizeof(OpT); +// // These two nested loops combined span exactly `pq_dim` iterations. +// for (; pq_dim > 0; pq_dim -= vec_len) { +// OpT pq_code = valid_range ? *pq_head : OpT(0); +// ++pq_head; +// // Loop over lcm(kBitsTotal, pq_bits) = pq_bits * vec_len. +// // Thanks to `lcm` in the expression above, +// // `bits_left` is zero after exactly vec_len iterations. 
+// #pragma unroll sizeof(OpT) +// for (int32_t bits_left = kBitsTotal; bits_left != 0;) { +// uint8_t code = extract_bits(pq_bits, pq_code); +// bits_left -= pq_bits; +// if (bits_left < 0) { // `code` is on the border between two `OpT` ints +// pq_code = valid_range ? *pq_head : OpT(0); +// ++pq_head; +// code <<= -bits_left; +// code |= extract_bits(-bits_left, pq_code); +// bits_left += kBitsTotal; +// } +// score += valid_range ? float(lut_scores[code]) : 0.0f; +// lut_scores += (1 << pq_bits); +// } +// } +// return score; +// } + template struct dummy_block_sort_t { using queue_t = topk::warp_sort_immediate; @@ -538,7 +617,7 @@ using block_sort_t = typename pq_block_sort::type; * vector and all the dataset vector in the cluster that we are probing. * * @tparam OpT is a carrier integer type selected to maximize throughput; - * Used solely in `ivfpq_compute_score`; + * Used solely in ivfpq-compute-score; * @tparam IdxT * The type of data indices * @tparam OutT @@ -634,10 +713,8 @@ __launch_bounds__(1024, 1) __global__ */ extern __shared__ __align__(256) uint8_t smem_buf[]; // NOLINT constexpr bool kManageLocalTopK = Capacity > 0; - constexpr uint32_t kOpBits = 8 * sizeof(OpT); const uint32_t pq_len = dim / pq_dim; - const uint32_t vec_len = kOpBits / gcd(kOpBits, pq_bits); const uint32_t lut_size = pq_dim << pq_bits; if constexpr (EnableSMemLut) { @@ -690,7 +767,7 @@ __launch_bounds__(1024, 1) __global__ if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; } else { - pq_center = pq_centers + (pq_len << pq_bits) * label; + pq_center = pq_centers + (Pow2<4>::roundUp(pq_len) << pq_bits) * label; } if constexpr (PrecompBaseDiff) { @@ -702,10 +779,11 @@ __launch_bounds__(1024, 1) __global__ } } break; case distance::DistanceType::InnerProduct: { + float2 pvals; for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { - auto q = query[i]; - base_diff[i] = q; - base_diff[dim + i] = cluster_center[i] * q; + pvals.x = query[i]; + pvals.y = cluster_center[i] * pvals.x; + reinterpret_cast(base_diff)[i] = pvals; } } break; } @@ -716,71 +794,88 @@ __launch_bounds__(1024, 1) __global__ // Create a lookup table // For each subspace, the lookup table stores the distance between the actual query vector // (projected into the subspace) and all possible pq vectors in that subspace. - const uint32_t pq_mask = (1u << pq_bits) - 1u; + const uint32_t pq_shift = 1u << pq_bits; + using f4 = raft::TxN_t; for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { - const uint32_t i_pq = i >> pq_bits; - const uint32_t i_shift = i_pq * pq_len; - const float* cur_pq_center = - pq_center + (i & pq_mask) + - (codebook_kind == codebook_gen::PER_SUBSPACE ? i_shift << pq_bits : 0u); + const uint32_t i_pq = i >> pq_bits; + uint32_t j = i_pq * pq_len; + const uint32_t j_end = pq_len + j; + auto cur_pq_center = reinterpret_cast(pq_center) + (i & (pq_shift - 1u)) + + (codebook_kind == codebook_gen::PER_SUBSPACE + ? 
(i_pq * Pow2<4>::roundUp(pq_len)) << (pq_bits - 2) + : 0u); float score = 0.0; - switch (metric) { - case distance::DistanceType::L2Expanded: { - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + i_shift; - float diff; - if constexpr (PrecompBaseDiff) { - diff = base_diff[k]; - } else { - diff = query[k] - cluster_center[k]; + do { + f4 pq_c; + *pq_c.vectorized_data() = *cur_pq_center; + cur_pq_center += pq_shift; + switch (metric) { + case distance::DistanceType::L2Expanded: { + f4 diff; +#pragma unroll + for (int l = 0; l < 4; l++, j++) { + if constexpr (PrecompBaseDiff) { + diff.val.data[l] = j < j_end ? base_diff[j] : 0.0f; + } else { + diff.val.data[l] = j < j_end ? query[j] - cluster_center[j] : 0.0f; + } } - diff -= cur_pq_center[j << pq_bits]; - score += diff * diff; - } - } break; - case distance::DistanceType::InnerProduct: { - // NB: we negate the scores as we hardcoded select-topk to always take the minimum - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + i_shift; - float q; - if constexpr (PrecompBaseDiff) { - q = base_diff[k]; - score -= base_diff[dim + k]; - } else { - q = query[k]; - score -= q * cluster_center[k]; +#pragma unroll + for (int l = 0; l < 4; l++) { + float d = diff.val.data[l] - pq_c.val.data[l]; + score += d * d; } - score -= q * cur_pq_center[j << pq_bits]; - } - } break; - } + } break; + case distance::DistanceType::InnerProduct: { + // NB: we negate the scores as we hardcoded select-topk to always compute the minimum + f4 q; +#pragma unroll + for (int l = 0; l < 4; l++, j++) { + if constexpr (PrecompBaseDiff) { + float2 pvals = j < j_end ? reinterpret_cast(base_diff)[j] : float2{}; + q.val.data[l] = pvals.x; + score -= pvals.y; + } else { + q.val.data[l] = j < j_end ? query[j] : 0.0f; + score -= j < j_end ? 
q.val.data[l] * cluster_center[j] : 0.0f; + } + } +#pragma unroll + for (int l = 0; l < 4; l++) { + score -= q.val.data[l] * pq_c.val.data[l]; + } + } break; + } + } while (j < j_end); lut_scores[i] = LutT(score); } } uint32_t sample_offset = 0; if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } - uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; - uint32_t n_samples32 = Pow2<32>::roundUp(n_samples); - IdxT cluster_offset = cluster_offsets[label]; + uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; + uint32_t n_samples32 = Pow2<32>::roundUp(n_samples); + IdxT cluster_offset = cluster_offsets[label]; + const uint32_t pq_line_width = pq_dim * pq_bits / 8; + auto pq_cluster_data = pq_dataset + size_t(cluster_offset) * size_t(pq_line_width); using local_topk_t = block_sort_t; local_topk_t block_topk(topk, smem_buf); - // Ensure lut_scores is written by all threads before using it in ivfpq_compute_score + // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score __threadfence_block(); __syncthreads(); // Compute a distance for each sample - const uint32_t pq_line_width = pq_dim * pq_bits / 8; for (uint32_t i = threadIdx.x; i < n_samples32; i += blockDim.x) { - OutT score = local_topk_t::queue_t::kDummy; - if (i < n_samples) { - auto pq_ptr = - reinterpret_cast(pq_dataset + uint64_t(pq_line_width) * (cluster_offset + i)); - float fscore = ivfpq_compute_score(pq_bits, vec_len, pq_dim, pq_ptr, lut_scores); - if (fscore < float(score)) { score = OutT(fscore); } - } + OutT score = local_topk_t::queue_t::kDummy; + float fscore = ivfpq_compute_score( + pq_bits, + pq_dim, + reinterpret_cast(pq_cluster_data + pq_line_width * i), + lut_scores, + i < n_samples); + if (fscore < float(score)) { score = OutT(fscore); } if constexpr (kManageLocalTopK) { block_topk.add(score, cluster_offset + i); } else { @@ -930,8 +1025,8 @@ struct ivfpq_compute_similarity { conf_no_smem_lut::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); const size_t smem_threshold = 48 * 1024; - size_t smem_size = sizeof(LutT) * (pq_dim << pq_bits); - size_t smem_size_base_diff = sizeof(float) * precomp_data_count; + size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); + size_t bdf_mem = sizeof(float) * precomp_data_count; uint32_t n_blocks = n_queries * n_probes; uint32_t n_threads = 1024; @@ -944,7 +1039,7 @@ struct ivfpq_compute_similarity { RAFT_CUDA_TRY(cudaGetDeviceProperties(&dev_props, cur_dev)); while (n_threads > thread_min) { if (n_blocks < uint32_t(getMultiProcessorCount() * (1024 / (n_threads / 2)))) { break; } - if (dev_props.sharedMemPerMultiprocessor * 2 / 3 < smem_size * (1024 / (n_threads / 2))) { + if (dev_props.sharedMemPerMultiprocessor * 2 / 3 < lut_mem * (1024 / (n_threads / 2))) { break; } n_threads /= 2; @@ -956,13 +1051,16 @@ struct ivfpq_compute_similarity { manage_local_topk ? 
topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) : 0; - smem_size = max(smem_size, smem_size_local_topk); + size_t smem_fast = max(lut_mem + bdf_mem, smem_size_local_topk); + size_t smem_no_basediff = max(lut_mem, smem_size_local_topk); + size_t smem_no_smem_lut = max(bdf_mem, smem_size_local_topk); - kernel_t kernel = kernel_no_basediff; + kernel_t kernel = kernel_no_basediff; + size_t smem_size = smem_no_basediff; bool kernel_no_basediff_available = true; bool use_smem_lut = true; - if (smem_size > smem_threshold) { + if (smem_no_basediff > smem_threshold) { cudaError_t cuda_status = cudaFuncSetAttribute( kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (cuda_status != cudaSuccess) { @@ -978,22 +1076,21 @@ struct ivfpq_compute_similarity { "%zu bytes)", smem_size); kernel = kernel_no_smem_lut; + smem_size = smem_no_smem_lut; use_smem_lut = false; n_threads = 1024; smem_size_local_topk = manage_local_topk ? topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) : 0; - smem_size = max(smem_size_base_diff, smem_size_local_topk); - n_blocks = getMultiProcessorCount(); + n_blocks = getMultiProcessorCount(); } } if (kernel_no_basediff_available) { bool kernel_fast_available = true; - if (smem_size + smem_size_base_diff > smem_threshold) { - cudaError_t cuda_status = cudaFuncSetAttribute(kernel_fast, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + smem_size_base_diff); + if (smem_fast > smem_threshold) { + cudaError_t cuda_status = + cudaFuncSetAttribute(kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_fast); if (cuda_status != cudaSuccess) { RAFT_EXPECTS( cuda_status == cudaGetLastError(), @@ -1002,22 +1099,22 @@ struct ivfpq_compute_similarity { RAFT_LOG_DEBUG( "No-precomputed-basediff kernel is selected, because the basediff wouldn't fit (shmem " "required: %zu bytes)", - smem_size + smem_size_base_diff); + smem_fast); } } if (kernel_fast_available) { int kernel_no_basediff_n_blocks = 0; RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_no_basediff_n_blocks, kernel_no_basediff, n_threads, smem_size)); + &kernel_no_basediff_n_blocks, kernel_no_basediff, n_threads, smem_no_basediff)); int kernel_fast_n_blocks = 0; RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_fast_n_blocks, kernel_fast, n_threads, smem_size + smem_size_base_diff)); + &kernel_fast_n_blocks, kernel_fast, n_threads, smem_fast)); // Use "kernel_fast" only if GPU occupancy does not drop too much if (kernel_no_basediff_n_blocks * 3 <= kernel_fast_n_blocks * 4) { - kernel = kernel_fast; - smem_size += smem_size_base_diff; + kernel = kernel_fast; + smem_size = smem_fast; } } } @@ -1031,7 +1128,7 @@ struct ivfpq_compute_similarity { } }; -auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool +inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool { if (k > kMaxCapacity) { return false; } // warp_sort not possible if (n_probes <= 16) { return false; } // too few clusters @@ -1143,11 +1240,11 @@ void ivfpq_search_worker(const handle_t& handle, case distance::DistanceType::L2Unexpanded: case distance::DistanceType::L2Expanded: { // stores basediff (query[i] - center[i]) - precomp_data_count = index.pq_dim(); + precomp_data_count = index.rot_dim(); } break; case distance::DistanceType::InnerProduct: { - // stores two components (query[i] * center[i], center[i]) - precomp_data_count = index.pq_dim() * 2; + // stores two 
components (query[i] * center[i], query[i] * center[i]) + precomp_data_count = index.rot_dim() * 2; } break; default: { RAFT_FAIL("Unsupported metric"); @@ -1164,6 +1261,19 @@ void ivfpq_search_worker(const handle_t& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); + printf( + "===| raft: n_rows = %zu, n_lists = %u, n_queries = %u, n_probes = %u, dim = %u, pq_dim = " + "%u, k = %u, max_samples = %u, metric = %d, codebook_kind = %d\n", + uint64_t(index.size()), + uint32_t(index.n_lists()), + uint32_t(n_queries), + uint32_t(n_probes), + uint32_t(index.rot_dim()), + uint32_t(index.pq_dim()), + uint32_t(topK), + uint32_t(max_samples), + int(index.metric()), + int(index.codebook_kind())); search_instance(stream, index.size(), @@ -1311,7 +1421,7 @@ inline auto get_max_batch_size(uint32_t k, auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; }; - constexpr uint64_t kMaxWsSize = 2ul * 1024 * 1024 * 1024; + constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024; if (ws_size(max_batch_size) > kMaxWsSize) { uint32_t smaller_batch_size = 1; // take powers of two for better alignment From 9cc1a926a8aa5de1efd32829cf056298f9e7072d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 19 Oct 2022 16:04:50 +0200 Subject: [PATCH 07/31] Fix invalid scores being added when number of samples % 32 != 0 --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 6ddb2709eb..bfb3bd5c2c 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -875,7 +875,7 @@ __launch_bounds__(1024, 1) __global__ reinterpret_cast(pq_cluster_data + pq_line_width * i), lut_scores, i < n_samples); - if (fscore < float(score)) { score = OutT(fscore); } + if (i < n_samples && fscore < float(score)) { score = OutT(fscore); } if constexpr (kManageLocalTopK) { block_topk.add(score, cluster_offset + i); } else { From 507158e36ad550c89508f338dc03b19c76b3c87d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 19 Oct 2022 16:28:15 +0200 Subject: [PATCH 08/31] Fix incorrect copying of pq_centers in PER_CLUSTER case --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 20 +++++++-------- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 2 +- cpp/test/neighbors/ann_ivf_flat.cu | 10 +++----- cpp/test/neighbors/ann_ivf_pq.cuh | 12 ++++----- .../ann_ivf_pq/test_float_int64_t.cu | 4 +-- .../ann_ivf_pq/test_float_uint32_t.cu | 4 +-- .../ann_ivf_pq/test_float_uint64_t.cu | 4 +-- .../ann_ivf_pq/test_int8_t_uint64_t.cu | 4 +-- .../ann_ivf_pq/test_uint8_t_uint64_t.cu | 4 +-- cpp/test/neighbors/ann_utils.cuh | 25 ++++++++++--------- 10 files changed, 43 insertions(+), 46 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 3f88c616ae..63a192f4da 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -845,14 +845,14 @@ inline auto extend(const handle_t& handle, } // Assemble the extended index - ivf_pq::index ext_index(handle, - orig_index.metric(), - orig_index.codebook_kind(), - n_clusters, - orig_index.dim(), - orig_index.pq_bits(), - orig_index.pq_dim(), - n_nonempty_lists); + index ext_index(handle, + 
orig_index.metric(), + orig_index.codebook_kind(), + n_clusters, + orig_index.dim(), + orig_index.pq_bits(), + orig_index.pq_dim(), + n_nonempty_lists); ext_index.allocate(handle, orig_index.size() + n_rows); // Copy the unchanged parts @@ -902,7 +902,7 @@ inline auto extend(const handle_t& handle, stream); } break; case codebook_gen::PER_CLUSTER: { - auto d = orig_index.pq_book_size() * orig_index.pq_len(); + auto d = orig_index.pq_book_size() * Pow2<4>::roundUp(orig_index.pq_len()); utils::copy_selected(n_clusters, d, orig_index.pq_centers().data_handle(), @@ -981,7 +981,7 @@ inline auto build( auto stream = handle.get_stream(); - ivf_pq::index index(handle, params, dim); + index index(handle, params, dim); utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); auto trainset_ratio = std::max( diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index bfb3bd5c2c..c83eb028ea 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1333,7 +1333,7 @@ template struct ivfpq_search { public: using fun_t = void (*)(const handle_t&, - const ivf_pq::index&, + const index&, uint32_t, uint32_t, uint32_t, diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu index 01af7ea0bd..b5e5da90d9 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cu +++ b/cpp/test/neighbors/ann_ivf_flat.cu @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -40,9 +40,7 @@ #include #include -namespace raft { -namespace spatial { -namespace knn { +namespace raft::neighbors::ivf_flat { template struct AnnIvfFlatInputs { @@ -302,6 +300,4 @@ TEST_P(AnnIVFFlatTestF_int8, AnnIVFFlat) { this->testIVFFlat(); } INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_int8, ::testing::ValuesIn(inputs)); -} // namespace knn -} // namespace spatial -} // namespace raft +} // namespace raft::neighbors::ivf_flat diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index a247f0101f..6a8bc86e84 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -20,10 +20,10 @@ #include #include +#include #include -#include #if defined RAFT_NN_COMPILED -#include +#include #else #pragma message("NN specializations are not enabled; expect very long building times.") #endif @@ -42,15 +42,15 @@ #include #include -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { struct ivf_pq_inputs { uint32_t num_db_vecs = 4096; uint32_t num_queries = 1024; uint32_t dim = 64; uint32_t k = 32; - raft::spatial::knn::ivf_pq::index_params index_params; - raft::spatial::knn::ivf_pq::search_params search_params; + ivf_pq::index_params index_params; + ivf_pq::search_params search_params; // Set some default parameters for tests ivf_pq_inputs() @@ -484,4 +484,4 @@ inline auto special_cases() -> test_cases_t #define INSTANTIATE(type, vals) \ INSTANTIATE_TEST_SUITE_P(IvfPq, type, ::testing::ValuesIn(vals)); /* NOLINT */ -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu index 30203fd2f0..ecb2faa6a0 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu @@ -16,7 +16,7 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace 
raft::neighbors::ivf_pq { using f32_f32_i64 = ivf_pq_test; @@ -24,4 +24,4 @@ TEST_BUILD_SEARCH(f32_f32_i64) TEST_BUILD_EXTEND_SEARCH(f32_f32_i64) INSTANTIATE(f32_f32_i64, enum_variety_l2() + enum_variety_ip() + big_dims_small_lut()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu index cf2cf1ac54..57d87f47f6 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_f32_u32 = ivf_pq_test; TEST_BUILD_SEARCH(f32_f32_u32) INSTANTIATE(f32_f32_u32, defaults() + var_n_probes() + var_k() + special_cases()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu index 5321783a32..2c203d12e7 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_f32_u64 = ivf_pq_test; TEST_BUILD_EXTEND_SEARCH(f32_f32_u64) INSTANTIATE(f32_f32_u64, defaults() + small_dims() + big_dims_moderate_lut()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu index e6cafb0ab4..c1029d590c 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_i08_u64 = ivf_pq_test; TEST_BUILD_SEARCH(f32_i08_u64) INSTANTIATE(f32_i08_u64, defaults() + big_dims() + var_k()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu index 23a4c87e14..729e99d22c 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu @@ -16,7 +16,7 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_u08_u64 = ivf_pq_test; @@ -24,4 +24,4 @@ TEST_BUILD_SEARCH(f32_u08_u64) TEST_BUILD_EXTEND_SEARCH(f32_u08_u64) INSTANTIATE(f32_u08_u64, small_dims_per_cluster() + enum_variety()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index faf6fad115..3e3735719f 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -25,7 +25,7 @@ #include #include -namespace raft::spatial::knn { +namespace raft::neighbors { struct print_dtype { cudaDataType_t value; @@ -169,16 +169,17 @@ void naiveBfKnn(EvalT* dist_topk, naive_distance_kernel<<>>( dist.data(), x + offset * dim, y, batch_size, input_len, dim, type); - detail::select_topk(dist.data(), - nullptr, - batch_size, - input_len, - static_cast(k), - dist_topk + offset * k, - indices_topk + offset * k, - type != raft::distance::DistanceType::InnerProduct, - stream, - mr); + spatial::knn::detail::select_topk( + 
dist.data(), + nullptr, + batch_size, + input_len, + static_cast(k), + dist_topk + offset * k, + indices_topk + offset * k, + type != raft::distance::DistanceType::InnerProduct, + stream, + mr); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -244,4 +245,4 @@ auto eval_neighbours(const std::vector& expected_idx, return testing::AssertionSuccess(); } -} // namespace raft::spatial::knn +} // namespace raft::neighbors From e75e77a308dd8525c53cbbd400df14099b2d70ae Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 20 Oct 2022 13:09:49 +0200 Subject: [PATCH 09/31] Filter topk results to minimize the number of warp-sorts --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 55 +++++++++++++------ .../spatial/knn/detail/topk/warpsort_topk.cuh | 12 +++- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index c83eb028ea..4bd991aed5 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -477,7 +478,7 @@ __device__ __forceinline__ void compute_score_inner_loop( { uint8_t code = extract_bits(pq_code); if constexpr (BitsLeft < PqBits) { // `code` is on the border between two `OpT` ints - pq_code = valid_range ? *pq_head : OpT(0); + pq_code = valid_range ? __ldcs(pq_head) : OpT(0); ++pq_head; code <<= PqBits - BitsLeft; code |= extract_bits(pq_code); @@ -504,7 +505,7 @@ __device__ __forceinline__ auto compute_score(uint32_t pq_dim, // These two nested loops combined span exactly `pq_dim` iterations. for (; pq_dim > 0; pq_dim -= kVecLen) { - OpT pq_code = valid_range ? *pq_head : OpT(0); + OpT pq_code = valid_range ? 
__ldcs(pq_head) : OpT(0); ++pq_head; compute_score_inner_loop( pq_head, lut_scores, pq_code, score, valid_range); @@ -591,13 +592,14 @@ __device__ __noinline__ auto ivfpq_compute_score(const uint32_t pq_bits, template struct dummy_block_sort_t { - using queue_t = topk::warp_sort_immediate; - __device__ dummy_block_sort_t(int k, uint8_t* smem_buf){}; + using queue_t = topk::warp_sort_filtered; + template + __device__ dummy_block_sort_t(int k, uint8_t* smem_buf, Args...){}; }; template struct pq_block_sort { - using type = topk::block_sort; + using type = topk::block_sort; }; template @@ -700,6 +702,7 @@ __launch_bounds__(1024, 1) __global__ const uint32_t* _chunk_indices, const float* queries, const uint32_t* index_list, + float* query_kths, LutT* lut_scores, OutT* _out_scores, IdxT* _out_indices) @@ -746,7 +749,6 @@ __launch_bounds__(1024, 1) __global__ query_ix = index_list[ib] / n_probes; probe_ix = index_list[ib] % n_probes; } - if (query_ix >= n_queries || probe_ix >= n_probes) continue; const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix); const float* query = queries + (dim * query_ix); @@ -860,7 +862,9 @@ __launch_bounds__(1024, 1) __global__ auto pq_cluster_data = pq_dataset + size_t(cluster_offset) * size_t(pq_line_width); using local_topk_t = block_sort_t; - local_topk_t block_topk(topk, smem_buf); + OutT query_kth = local_topk_t::queue_t::kDummy; + if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } + local_topk_t block_topk(topk, smem_buf, query_kth); // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score __threadfence_block(); @@ -887,6 +891,7 @@ __launch_bounds__(1024, 1) __global__ __syncthreads(); block_topk.done(); block_topk.store(out_scores, out_indices); + if (threadIdx.x == 0) { atomicMin(query_kths + query_ix, float(out_scores[topk - 1])); } } else { // fill in the rest of the out_scores with dummy values uint32_t max_samples = uint32_t(Pow2<128>::roundUp(cluster_offsets[n_probes])); @@ -926,6 +931,7 @@ struct ivfpq_compute_similarity { const uint32_t*, const float*, const uint32_t*, + float*, LutT*, OutT*, IdxT*); @@ -952,7 +958,7 @@ struct ivfpq_compute_similarity { if constexpr (Capacity > 0) { if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } } - if constexpr (Capacity > 32) { + if constexpr (Capacity > 1) { if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } } return ivfpq_compute_similarity_kernel(n_threads / WarpSize, topk) + ? topk::template calc_smem_size_for_block_wide(n_threads / subwarp_size, topk) : 0; size_t smem_fast = max(lut_mem + bdf_mem, smem_size_local_topk); size_t smem_no_basediff = max(lut_mem, smem_size_local_topk); @@ -1075,15 +1085,15 @@ struct ivfpq_compute_similarity { "required: " "%zu bytes)", smem_size); - kernel = kernel_no_smem_lut; - smem_size = smem_no_smem_lut; - use_smem_lut = false; - n_threads = 1024; - smem_size_local_topk = - manage_local_topk - ? topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) - : 0; - n_blocks = getMultiProcessorCount(); + kernel = kernel_no_smem_lut; + smem_size = smem_no_smem_lut; + use_smem_lut = false; + n_threads = 1024; + smem_size_local_topk = manage_local_topk + ? 
topk::template calc_smem_size_for_block_wide( + n_threads / subwarp_size, topk) + : 0; + n_blocks = getMultiProcessorCount(); } } if (kernel_no_basediff_available) { @@ -1275,6 +1285,14 @@ void ivfpq_search_worker(const handle_t& handle, int(index.metric()), int(index.codebook_kind())); + rmm::device_uvector query_kths(0, stream, mr); + if (manage_local_topk) { + query_kths.resize(n_queries, stream); + thrust::fill_n(handle.get_thrust_policy(), + query_kths.data(), + n_queries, + float(dummy_block_sort_t::queue_t::kDummy)); + } search_instance(stream, index.size(), index.rot_dim(), @@ -1293,6 +1311,7 @@ void ivfpq_search_worker(const handle_t& handle, chunk_index.data(), query, index_list_sorted, + query_kths.data(), device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index 84cc072620..da645290f9 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -276,8 +276,8 @@ class warp_sort_filtered : public warp_sort { using warp_sort::kWarpWidth; using warp_sort::k; - __device__ warp_sort_filtered(int k) - : warp_sort(k), buf_len_(0), k_th_(kDummy) + __device__ warp_sort_filtered(int k, T limit) + : warp_sort(k), buf_len_(0), k_th_(limit) { #pragma unroll for (int i = 0; i < kMaxBufLen; i++) { @@ -286,6 +286,11 @@ class warp_sort_filtered : public warp_sort { } } + __device__ __forceinline__ explicit warp_sort_filtered(int k) + : warp_sort_filtered(k, kDummy) + { + } + __device__ void add(T val, IdxT idx) { // comparing for k_th should reduce the total amount of updates: @@ -436,7 +441,8 @@ class block_sort { public: using queue_t = WarpSortWarpWide; - __device__ block_sort(int k, uint8_t* smem_buf) : queue_(k) + template + __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...) 
{ val_smem_ = reinterpret_cast(smem_buf); const int num_of_warp = subwarp_align::div(blockDim.x); From 662124a1a0dcdfa958ff4a37028793a9e52de00e Mon Sep 17 00:00:00 2001 From: Artem Chirkin Date: Sat, 22 Oct 2022 01:17:26 -0700 Subject: [PATCH 10/31] Implemented warp_sort_distributed --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 4 +- .../spatial/knn/detail/topk/warpsort_topk.cuh | 102 ++++++++++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 4bd991aed5..0e9d9ccb33 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -592,14 +592,14 @@ __device__ __noinline__ auto ivfpq_compute_score(const uint32_t pq_bits, template struct dummy_block_sort_t { - using queue_t = topk::warp_sort_filtered; + using queue_t = topk::warp_sort_distributed; template __device__ dummy_block_sort_t(int k, uint8_t* smem_buf, Args...){}; }; template struct pq_block_sort { - using type = topk::block_sort; + using type = topk::block_sort; }; template diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index da645290f9..fe1a7611b9 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -361,6 +361,108 @@ class warp_sort_filtered : public warp_sort { T k_th_; }; +/** + * This version of warp_sort compares each input element against the current + * estimate of k-th value before adding it to the intermediate sorting buffer. + * In contrast to `warp_sort_filtered`, it keeps one distributed buffer for + * all threads in a warp (independently of the subwarp size), which makes its flushing less often. + */ +template +class warp_sort_distributed : public warp_sort { + public: + using warp_sort::kDummy; + using warp_sort::kWarpWidth; + using warp_sort::k; + + __device__ warp_sort_distributed(int k, T limit) + : warp_sort(k), + buf_val_(kDummy), + buf_idx_(IdxT{}), + buf_len_(0), + k_th_(limit) + { + } + + __device__ __forceinline__ explicit warp_sort_distributed(int k) + : warp_sort_distributed(k, kDummy) + { + } + + __device__ void add(T val, IdxT idx) + { + // mask tells which lanes in the warp have valid items to be added + uint32_t mask = ballot(is_ordered(val, k_th_)); + if (mask == 0) { return; } + // how many elements to be added + uint32_t n_valid = __popc(mask); + // index of the source lane containing the value to put into the current lane. + uint32_t src_ix = 0; + // remove a few smallest set bits from the mask. + for (uint32_t i = std::min(n_valid, Pow2::mod(uint32_t(laneId()) - buf_len_)); i > 0; + i--) { + src_ix = __ffs(mask) - 1; + mask ^= (0x1u << src_ix); + } + // now the least significant bit of the mask corresponds to the lane id we want to get. + // for not-added (invalid) indices, the mask is zeroed by now. 
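+    // e.g. with buf_len_ = 30 and n_valid = 3: lanes 30 and 31 strip 0 and 1 bits and thus
+    // read the 1st and 2nd valid items; lane 0 strips 2 bits, reads the 3rd item, and stores
+    // it only after the merge below brings buf_len_ down to 1.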
+ src_ix = __ffs(mask) - 1; + // rearrange the inputs to be ready to put them into the tmp buffer + val = shfl(val, src_ix); + idx = shfl(idx, src_ix); + // for non-valid lanes, src_ix should be uint(-1) + if (mask == 0) { val = kDummy; } + // save the values into the free slots of the warp tmp buffer + if (laneId() >= buf_len_) { + buf_val_ = val; + buf_idx_ = idx; + } + buf_len_ += n_valid; + if (buf_len_ < WarpSize) { return; } + // merge the warp tmp buffer into the queue + merge_buf_(); + buf_len_ -= WarpSize; + // save the inputs that couldn't fit before the merge + if (laneId() < buf_len_) { + buf_val_ = val; + buf_idx_ = idx; + } + } + + __device__ void done() + { + if (buf_len_ != 0) { + merge_buf_(); + buf_len_ = 0; + } + } + + private: + __device__ __forceinline__ void set_k_th_() + { + // NB on using srcLane: it's ok if it is outside the warp size / width; + // the modulo op will be done inside the __shfl_sync. + k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); + } + + __device__ __forceinline__ void merge_buf_() + { + topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); + this->merge_in<1>(&buf_val_, &buf_idx_); + set_k_th_(); // contains warp sync + buf_val_ = kDummy; + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + + T buf_val_; + IdxT buf_idx_; + uint32_t buf_len_; // 0 <= buf_len_ <= WarpSize + + T k_th_; +}; + /** * This version of warp_sort adds every input element into the intermediate sorting * buffer, and thus does the sorting step every `Capacity` input elements. From 17a75b24e44efcf807e72bef64feb8b0eac677d6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Sun, 23 Oct 2022 17:44:03 +0200 Subject: [PATCH 11/31] Add missing 'ballot' wrapper --- cpp/include/raft/util/cuda_utils.cuh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 1d1c82eb94..ef54255e9c 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -611,6 +611,16 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) return inFlag; } +/** For every thread in the warp, set the corresponding bit to the thread's flag value. 
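+ * E.g., if only lanes 0 and 3 of a full warp pass `true`, the returned mask is 0b1001.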
*/ +DI uint32_t ballot(bool inFlag, uint32_t mask = 0xffffffffu) +{ +#if CUDART_VERSION >= 9000 + return __ballot_sync(mask, inFlag); +#else + return __ballot(inFlag); +#endif +} + /** * @brief Shuffle the data inside a warp * @tparam T the data type (currently assumed to be 4B) From a30ad2bb4ec12f0c92cef7d65bbdc35e584511a8 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 24 Oct 2022 13:48:15 +0200 Subject: [PATCH 12/31] Load pq_centers coalesced, but not vectorized --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 11 ++-- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 24 ++++---- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 58 +++++++------------ 3 files changed, 36 insertions(+), 57 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 815e3fc1de..683a88150f 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -290,12 +290,12 @@ struct index : ann::index { } using pq_centers_extents = - std::experimental::extents; + std::experimental::extents; /** * PQ cluster centers * - * - codebook_gen::PER_SUBSPACE: [pq_dim , ceildiv(pq_len, 4), pq_book_size, 4] - * - codebook_gen::PER_CLUSTER: [n_lists, ceildiv(pq_len, 4), pq_book_size, 4] + * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] + * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] */ inline auto pq_centers() noexcept -> device_mdspan { @@ -408,12 +408,11 @@ struct index : ann::index { auto make_pq_centers_extents() -> pq_centers_extents { - auto d = raft::div_rounding_up_unsafe(pq_len(), 4); switch (codebook_kind()) { case codebook_gen::PER_SUBSPACE: - return make_extents(pq_dim(), d, pq_book_size(), 4); + return make_extents(pq_dim(), pq_len(), pq_book_size()); case codebook_gen::PER_CLUSTER: - return make_extents(n_lists(), d, pq_book_size(), 4); + return make_extents(n_lists(), pq_len(), pq_book_size()); default: RAFT_FAIL("Unreachable code"); } } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 63a192f4da..9839f346cc 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -376,11 +376,9 @@ void compute_pq_codes( pq_centers_tmp.data(), pq_len * pq_width, [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) { - auto pq_i = i % pq_len; - auto i0 = pq_i % 4; - auto i2 = pq_i / 4; - auto i1 = i / pq_len; - *out = pq_centers(l, i2, i1, i0); + auto i0 = i / pq_len; + auto i1 = i % pq_len; + *out = pq_centers(l, i1, i0); }, stream); } @@ -394,11 +392,9 @@ void compute_pq_codes( pq_centers_tmp.data(), pq_len * pq_width, [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) { - auto pq_i = i % pq_len; - auto i0 = pq_i % 4; - auto i2 = pq_i / 4; - auto i1 = i / pq_len; - *out = pq_centers(j, i2, i1, i0); + auto i0 = i / pq_len; + auto i1 = i % pq_len; + *out = pq_centers(j, i1, i0); }, stream); } @@ -500,13 +496,13 @@ void transpose_pq_centers(index& index, index.pq_centers().data_handle(), index.pq_centers().size(), [pq_centers_source, extents, pq_len, pq_width] __device__(float* out, size_t i) { - uint32_t ii[4]; - for (int r = 3; r > 0; r--) { + uint32_t ii[3]; + for (int r = 2; r > 0; r--) { ii[r] = i % extents.extent(r); i /= extents.extent(r); } ii[0] = i; - auto j2 = ii[3] + ii[1] * extents.extent(3); + auto j2 = ii[1]; auto j1 = ii[2]; auto j0 = ii[0]; auto j = ((j0 * pq_width) + j1) * pq_len + j2; @@ 
-902,7 +898,7 @@ inline auto extend(const handle_t& handle, stream); } break; case codebook_gen::PER_CLUSTER: { - auto d = orig_index.pq_book_size() * Pow2<4>::roundUp(orig_index.pq_len()); + auto d = orig_index.pq_book_size() * orig_index.pq_len(); utils::copy_selected(n_clusters, d, orig_index.pq_centers().data_handle(), diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 0e9d9ccb33..30b570fd52 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -769,7 +769,7 @@ __launch_bounds__(1024, 1) __global__ if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; } else { - pq_center = pq_centers + (Pow2<4>::roundUp(pq_len) << pq_bits) * label; + pq_center = pq_centers + (pq_len << pq_bits) * label; } if constexpr (PrecompBaseDiff) { @@ -797,58 +797,42 @@ __launch_bounds__(1024, 1) __global__ // For each subspace, the lookup table stores the distance between the actual query vector // (projected into the subspace) and all possible pq vectors in that subspace. const uint32_t pq_shift = 1u << pq_bits; - using f4 = raft::TxN_t; for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { const uint32_t i_pq = i >> pq_bits; uint32_t j = i_pq * pq_len; const uint32_t j_end = pq_len + j; - auto cur_pq_center = reinterpret_cast(pq_center) + (i & (pq_shift - 1u)) + - (codebook_kind == codebook_gen::PER_SUBSPACE - ? (i_pq * Pow2<4>::roundUp(pq_len)) << (pq_bits - 2) - : 0u); + auto cur_pq_center = pq_center + (i & (pq_shift - 1u)) + + (codebook_kind == codebook_gen::PER_SUBSPACE ? j << pq_bits : 0u); float score = 0.0; do { - f4 pq_c; - *pq_c.vectorized_data() = *cur_pq_center; + float pq_c = *cur_pq_center; cur_pq_center += pq_shift; switch (metric) { case distance::DistanceType::L2Expanded: { - f4 diff; -#pragma unroll - for (int l = 0; l < 4; l++, j++) { - if constexpr (PrecompBaseDiff) { - diff.val.data[l] = j < j_end ? base_diff[j] : 0.0f; - } else { - diff.val.data[l] = j < j_end ? query[j] - cluster_center[j] : 0.0f; - } - } -#pragma unroll - for (int l = 0; l < 4; l++) { - float d = diff.val.data[l] - pq_c.val.data[l]; - score += d * d; + float diff; + if constexpr (PrecompBaseDiff) { + diff = base_diff[j]; + } else { + diff = query[j] - cluster_center[j]; } + diff -= pq_c; + score += diff * diff; } break; case distance::DistanceType::InnerProduct: { // NB: we negate the scores as we hardcoded select-topk to always compute the minimum - f4 q; -#pragma unroll - for (int l = 0; l < 4; l++, j++) { - if constexpr (PrecompBaseDiff) { - float2 pvals = j < j_end ? reinterpret_cast(base_diff)[j] : float2{}; - q.val.data[l] = pvals.x; - score -= pvals.y; - } else { - q.val.data[l] = j < j_end ? query[j] : 0.0f; - score -= j < j_end ? 
q.val.data[l] * cluster_center[j] : 0.0f; - } - } -#pragma unroll - for (int l = 0; l < 4; l++) { - score -= q.val.data[l] * pq_c.val.data[l]; + float q; + if constexpr (PrecompBaseDiff) { + float2 pvals = reinterpret_cast(base_diff)[j]; + q = pvals.x; + score -= pvals.y; + } else { + q = query[j]; + score -= q * cluster_center[j]; } + score -= q * pq_c; } break; } - } while (j < j_end); + } while (++j < j_end); lut_scores[i] = LutT(score); } } From 910ae3322bcc16eb499f746543c2adf75aad5217 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 27 Oct 2022 07:52:38 +0200 Subject: [PATCH 13/31] Changed layout of the pq data --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 53 ++- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 120 ++++--- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 322 ++++++------------ 3 files changed, 217 insertions(+), 278 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 683a88150f..f4db8b7463 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -119,6 +119,11 @@ struct search_params : ann::search_params { static_assert(std::is_aggregate_v); static_assert(std::is_aggregate_v); +/** Size of the interleaved group. */ +constexpr static uint32_t kIndexGroupSize = 32; +/** Stride of the interleaved group for vectorized loads. */ +constexpr static uint32_t kIndexGroupVecLen = 16; + /** * @brief IVF-PQ index. * @@ -247,12 +252,12 @@ struct index : ann::index { pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim), n_nonempty_lists_(n_nonempty_lists), pq_centers_{make_device_mdarray(handle, make_pq_centers_extents())}, - pq_dataset_{make_device_mdarray( - handle, make_extents(0, this->pq_dim() * this->pq_bits() / 8))}, + pq_dataset_{make_device_mdarray(handle, make_pq_dataset_extents(0))}, indices_{make_device_mdarray(handle, make_extents(0))}, rotation_matrix_{ make_device_mdarray(handle, make_extents(this->rot_dim(), this->dim()))}, list_offsets_{make_device_mdarray(handle, make_extents(this->n_lists() + 1))}, + list_sizes_{make_device_mdarray(handle, make_extents(this->n_lists()))}, centers_{make_device_mdarray( handle, make_extents(this->n_lists(), this->dim_ext()))}, centers_rot_{make_device_mdarray( @@ -283,9 +288,8 @@ struct index : ann::index { */ void allocate(const handle_t& handle, IdxT index_size) { - pq_dataset_ = - make_device_mdarray(handle, make_extents(index_size, pq_dataset_.extent(1))); - indices_ = make_device_mdarray(handle, make_extents(index_size)); + pq_dataset_ = make_device_mdarray(handle, make_pq_dataset_extents(index_size)); + indices_ = make_device_mdarray(handle, make_extents(index_size)); check_consistency(); } @@ -307,13 +311,21 @@ struct index : ann::index { return pq_centers_.view(); } - /** PQ-encoded data [size, pq_dim * pq_bits / 8]. */ - inline auto pq_dataset() noexcept -> device_mdspan, row_major> + using pq_dataset_extents = std::experimental:: + extents; + /** PQ-encoded data + * [ceildiv(size, kIndexGroupSize) + * , ceildiv(pq_dim * pq_bits / 8, kIndexGroupVecLen) + * , kIndexGroupSize + * , kIndexGroupVecLen + * ]. + */ + inline auto pq_dataset() noexcept -> device_mdspan { return pq_dataset_.view(); } [[nodiscard]] inline auto pq_dataset() const noexcept - -> device_mdspan, row_major> + -> device_mdspan { return pq_dataset_.view(); } @@ -354,6 +366,17 @@ struct index : ann::index { return list_offsets_.view(); } + /** Sizes of the lists [n_lists]. 
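+   * Kept separately from `list_offsets`, because the offsets are rounded up to
+   * `kIndexGroupSize` and hence do not determine the exact list sizes on their own.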
*/ + inline auto list_sizes() noexcept -> device_mdspan, row_major> + { + return list_sizes_.view(); + } + [[nodiscard]] inline auto list_sizes() const noexcept + -> device_mdspan, row_major> + { + return list_sizes_.view(); + } + /** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */ inline auto centers() noexcept -> device_mdspan, row_major> { @@ -376,6 +399,17 @@ struct index : ann::index { return centers_rot_.view(); } + /** A helper function to determine the extents of an array enough to hold a given amount of data. + */ + auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents + { + auto l = pq_dim() * pq_bits() / 8; + return make_extents(raft::div_rounding_up_safe(n_rows, kIndexGroupSize), + raft::div_rounding_up_safe(l, kIndexGroupVecLen), + kIndexGroupSize, + kIndexGroupVecLen); + } + private: raft::distance::DistanceType metric_; codebook_gen codebook_kind_; @@ -386,10 +420,11 @@ struct index : ann::index { uint32_t n_nonempty_lists_; device_mdarray pq_centers_; - device_mdarray, row_major> pq_dataset_; + device_mdarray pq_dataset_; device_mdarray, row_major> indices_; device_mdarray, row_major> rotation_matrix_; device_mdarray, row_major> list_offsets_; + device_mdarray, row_major> list_sizes_; device_mdarray, row_major> centers_; device_mdarray, row_major> centers_rot_; diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 9839f346cc..5a0c92e264 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ using namespace raft::spatial::knn::detail; // NOLINT using raft::neighbors::ivf_pq::codebook_gen; using raft::neighbors::ivf_pq::index; using raft::neighbors::ivf_pq::index_params; +using raft::neighbors::ivf_pq::kIndexGroupSize; namespace { @@ -777,31 +779,27 @@ inline auto extend(const handle_t& handle, rmm::device_uvector ext_cluster_sizes_buf(n_clusters, stream, &managed_memory); rmm::device_uvector old_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); rmm::device_uvector ext_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); - rmm::device_uvector cluster_ordering(n_clusters, stream, &managed_memory); + rmm::device_uvector cluster_ordering_buf(n_clusters, stream, &managed_memory); auto old_cluster_sizes = old_cluster_sizes_buf.data(); auto ext_cluster_sizes = ext_cluster_sizes_buf.data(); auto old_cluster_offsets = old_cluster_offsets_buf.data(); auto ext_cluster_offsets = ext_cluster_offsets_buf.data(); + auto cluster_ordering = cluster_ordering_buf.data(); copy(old_cluster_offsets, orig_index.list_offsets().data_handle(), orig_index.list_offsets().size(), stream); + copy(old_cluster_sizes, + orig_index.list_sizes().data_handle(), + orig_index.list_sizes().size(), + stream); uint32_t n_nonempty_lists = 0; { rmm::device_uvector ext_cluster_sizes_buf_in(n_clusters, stream, device_memory); rmm::device_uvector cluster_ordering_in(n_clusters, stream, device_memory); auto ext_cluster_sizes_in = ext_cluster_sizes_buf_in.data(); - linalg::writeOnlyUnaryOp( - old_cluster_sizes, - n_clusters, - [ext_cluster_sizes_in, new_cluster_sizes, old_cluster_offsets] __device__(uint32_t * out, - size_t i) { - auto old_size = old_cluster_offsets[i + 1] - old_cluster_offsets[i]; - ext_cluster_sizes_in[i] = old_size + new_cluster_sizes[i]; - *out = old_size; - }, - stream); + 
linalg::add(ext_cluster_sizes_in, old_cluster_sizes, new_cluster_sizes, n_clusters, stream); thrust::sequence(handle.get_thrust_policy(), cluster_ordering_in.data(), @@ -815,7 +813,7 @@ inline auto extend(const handle_t& handle, ext_cluster_sizes_in, ext_cluster_sizes, cluster_ordering_in.data(), - cluster_ordering.data(), + cluster_ordering, n_clusters, begin_bit, end_bit, @@ -826,7 +824,7 @@ inline auto extend(const handle_t& handle, ext_cluster_sizes_in, ext_cluster_sizes, cluster_ordering_in.data(), - cluster_ordering.data(), + cluster_ordering, n_clusters, begin_bit, end_bit, @@ -849,35 +847,41 @@ inline auto extend(const handle_t& handle, orig_index.pq_bits(), orig_index.pq_dim(), n_nonempty_lists); - ext_index.allocate(handle, orig_index.size() + n_rows); - - // Copy the unchanged parts - copy(ext_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().size(), - stream); - // calculate extended cluster offsets - auto ext_indices = ext_index.indices().data_handle(); { - IdxT zero = 0; - update_device(ext_cluster_offsets, &zero, 1, stream); - thrust::inclusive_scan(handle.get_thrust_policy(), - ext_cluster_sizes, - ext_cluster_sizes + n_clusters, - ext_cluster_offsets + 1, - [] __device__(IdxT s, uint32_t l) { return s + l; }); + using group_align = Pow2; + IdxT size = 0; + update_device(ext_cluster_offsets, &size, 1, stream); + thrust::inclusive_scan( + handle.get_thrust_policy(), + ext_cluster_sizes, + ext_cluster_sizes + n_clusters, + ext_cluster_offsets + 1, + [] __device__(IdxT a, IdxT b) { return group_align::roundUp(a) + group_align::roundUp(b); }); + update_host(&size, ext_cluster_offsets + n_clusters, 1, stream); + handle.sync_stream(); copy(ext_index.list_offsets().data_handle(), ext_cluster_offsets, ext_index.list_offsets().size(), stream); + copy(ext_index.list_sizes().data_handle(), + ext_cluster_sizes, + ext_index.list_sizes().size(), + stream); + ext_index.allocate(handle, size); } + // Copy the unchanged parts + copy(ext_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().size(), + stream); + // copy cluster-ordering-dependent data utils::copy_selected(n_clusters, ext_index.dim_ext(), orig_index.centers().data_handle(), - cluster_ordering.data(), + cluster_ordering, orig_index.dim_ext(), ext_index.centers().data_handle(), ext_index.dim_ext(), @@ -885,7 +889,7 @@ inline auto extend(const handle_t& handle, utils::copy_selected(n_clusters, ext_index.rot_dim(), orig_index.centers_rot().data_handle(), - cluster_ordering.data(), + cluster_ordering, orig_index.rot_dim(), ext_index.centers_rot().data_handle(), ext_index.rot_dim(), @@ -902,7 +906,7 @@ inline auto extend(const handle_t& handle, utils::copy_selected(n_clusters, d, orig_index.pq_centers().data_handle(), - cluster_ordering.data(), + cluster_ordering, d, ext_index.pq_centers().data_handle(), d, @@ -913,8 +917,9 @@ inline auto extend(const handle_t& handle, // Make ext_indices handle.sync_stream(); // make sure cluster sizes are up-to-date + auto ext_indices = ext_index.indices().data_handle(); for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - auto k = cluster_ordering.data()[l]; + auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; auto new_cluster_size = new_cluster_sizes[k]; if (old_cluster_size > 0) { @@ -944,19 +949,43 @@ inline auto extend(const handle_t& handle, } /* Extend the pq_dataset */ - auto ext_pq_dataset = ext_index.pq_dataset().data_handle(); - 
size_t pq_dataset_unit = ext_index.pq_dim() * ext_index.pq_bits() / 8; + auto ext_pq_dataset = ext_index.pq_dataset().data_handle(); + auto pq_dataset_unit = + size_t(ext_index.pq_dataset().extent(1)) * size_t(ext_index.pq_dataset().extent(3)); + auto raw_data_unit = size_t(ext_index.pq_dim() * ext_index.pq_bits() / 8); for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - auto k = cluster_ordering.data()[l]; - auto old_cluster_size = size_t(old_cluster_sizes[k]); - copy(ext_pq_dataset + pq_dataset_unit * size_t(ext_cluster_offsets[l]), - orig_index.pq_dataset().data_handle() + pq_dataset_unit * size_t(old_cluster_offsets[k]), - pq_dataset_unit * old_cluster_size, - stream); - copy(ext_pq_dataset + pq_dataset_unit * size_t(ext_cluster_offsets[l] + old_cluster_size), - new_pq_codes.data() + pq_dataset_unit * size_t(new_cluster_offsets.data()[k]), - pq_dataset_unit * size_t(new_cluster_sizes[k]), - stream); + auto k = cluster_ordering[l]; + auto old_cluster_size = old_cluster_sizes[k]; + auto old_pq_dataset = + make_mdspan(orig_index.pq_dataset().data_handle() + pq_dataset_unit * old_cluster_offsets[k], + ext_index.make_pq_dataset_extents(old_cluster_size)); + auto new_pq_data = + make_mdspan(new_pq_codes.data() + raw_data_unit * new_cluster_offsets.data()[k], + make_extents(new_cluster_sizes[k], raw_data_unit)); + linalg::writeOnlyUnaryOp( + ext_pq_dataset + pq_dataset_unit * ext_cluster_offsets[l], + pq_dataset_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), + [old_pq_dataset, new_pq_data, old_cluster_size] __device__(uint8_t * out, size_t i_flat) { + size_t i[4]; + for (int r = 3; r > 0; r--) { + i[r] = i_flat % old_pq_dataset.extent(r); + i_flat /= old_pq_dataset.extent(r); + } + i[0] = i_flat; + auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2]; + if (row_ix < old_cluster_size) { + *out = old_pq_dataset(i[0], i[1], i[2], i[3]); + } else { + row_ix -= old_cluster_size; + auto col_ix = i[1] * old_pq_dataset.extent(3) + i[3]; + if (row_ix < new_pq_data.extent(0) && col_ix < new_pq_data.extent(1)) { + *out = new_pq_data(row_ix, col_ix); + } else { + *out = 0; + } + } + }, + stream); } return ext_index; @@ -979,6 +1008,7 @@ inline auto build( index index(handle, params, dim); utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); + utils::memzero(index.list_sizes().data_handle(), index.list_sizes().size(), stream); auto trainset_ratio = std::max( 1, diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 30b570fd52..1bdf9ce6cb 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -60,6 +60,8 @@ using namespace raft::spatial::knn::detail; // NOLINT using raft::neighbors::ivf_pq::codebook_gen; using raft::neighbors::ivf_pq::index; +using raft::neighbors::ivf_pq::kIndexGroupSize; +using raft::neighbors::ivf_pq::kIndexGroupVecLen; using raft::neighbors::ivf_pq::search_params; /** 8-bit floating-point storage type. @@ -232,10 +234,10 @@ void select_clusters(const handle_t& handle, * in chunk_indices. Essentially this is a segmented inclusive scan of the cluster sizes. The total * number of samples per query (sum of the cluster sizes that we probe) is returned in n_samples. 
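 * E.g., a query probing three clusters of sizes {10, 5, 7} gets chunk_indices {10, 15, 22} and
 * n_samples = 22.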
*/ -template +template __launch_bounds__(BlockDim) __global__ void calc_chunk_indices_kernel(uint32_t n_probes, - const IdxT* cluster_offsets, // [n_clusters + 1] + const uint32_t* cluster_sizes, // [n_clusters] const uint32_t* clusters_to_probe, // [n_queries, n_probes] uint32_t* chunk_indices, // [n_queries, n_probes] uint32_t* n_samples // [n_queries] @@ -253,9 +255,7 @@ __launch_bounds__(BlockDim) __global__ uint32_t total = 0; for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) { auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u; - auto chunk = probe_ix < n_probes - ? static_cast(cluster_offsets[label + 1] - cluster_offsets[label]) - : 0u; + auto chunk = probe_ix < n_probes ? cluster_sizes[label] : 0u; if (threadIdx.x == 0) { chunk += total; } block_scan(shm).InclusiveSum(chunk, chunk, total); __syncthreads(); @@ -265,7 +265,6 @@ __launch_bounds__(BlockDim) __global__ if (threadIdx.x == 0) { n_samples[blockIdx.x] = total; } } -template struct calc_chunk_indices { public: struct configured { @@ -274,19 +273,19 @@ struct calc_chunk_indices { dim3 grid_dim; uint32_t n_probes; - void operator()(const IdxT* cluster_offsets, - const uint32_t* clusters_to_probe, - uint32_t* chunk_indices, - uint32_t* n_samples, - rmm::cuda_stream_view stream) + inline void operator()(const uint32_t* cluster_sizes, + const uint32_t* clusters_to_probe, + uint32_t* chunk_indices, + uint32_t* n_samples, + rmm::cuda_stream_view stream) { void* args[] = // NOLINT - {&n_probes, &cluster_offsets, &clusters_to_probe, &chunk_indices, &n_samples}; + {&n_probes, &cluster_sizes, &clusters_to_probe, &chunk_indices, &n_samples}; RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream)); } }; - static auto configure(uint32_t n_probes, uint32_t n_queries) -> configured + static inline auto configure(uint32_t n_probes, uint32_t n_queries) -> configured { return try_block_dim<1024>(n_probes, n_queries); } @@ -298,7 +297,7 @@ struct calc_chunk_indices { if constexpr (BlockDim >= WarpSize * 2) { if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); } } - return {reinterpret_cast(calc_chunk_indices_kernel), + return {reinterpret_cast(calc_chunk_indices_kernel), dim3(BlockDim, 1, 1), dim3(n_queries, 1, 1), n_probes}; @@ -450,146 +449,6 @@ void postprocess_distances(float* out, // [n_queries, topk] } } -/** Extract `Bits` lowest bits from the `source` and shift the remaining content by this number. */ -template -__device__ __forceinline__ auto extract_bits(OpT& source) -> uint8_t -{ - if constexpr (Bits < 8 * sizeof(OpT)) { - constexpr uint32_t Mask = (1u << Bits) - 1u; // NOLINT - uint8_t r = source & Mask; - source >>= Bits; - return r; - } else { - uint8_t r = source; - source = 0; - return r; - } -} - -/** - * Starting with `BitsLeft = kBitsTotal = 8 * sizeof(OpT)`, - * loop over `lcm(kBitsTotal, PqBits) = PqBits * kVecLen` bits of data. - * Thanks to `lcm` in the expression above, - * `BitsLeft` is zero after exactly kVecLen iterations. - */ -template -__device__ __forceinline__ void compute_score_inner_loop( - const OpT*& pq_head, const LutT*& lut_scores, OpT& pq_code, float& score, const bool valid_range) -{ - uint8_t code = extract_bits(pq_code); - if constexpr (BitsLeft < PqBits) { // `code` is on the border between two `OpT` ints - pq_code = valid_range ? __ldcs(pq_head) : OpT(0); - ++pq_head; - code <<= PqBits - BitsLeft; - code |= extract_bits(pq_code); - } - score += valid_range ? 
float(lut_scores[code]) : 0.0f; - lut_scores += (1u << PqBits); - if constexpr (BitsLeft != PqBits) { - constexpr uint32_t kBitsLeftNext = - BitsLeft < PqBits ? (8 * sizeof(OpT) + BitsLeft - PqBits) : (BitsLeft - PqBits); - compute_score_inner_loop( - pq_head, lut_scores, pq_code, score, valid_range); - } -} - -template -__device__ __forceinline__ auto compute_score(uint32_t pq_dim, - const OpT* pq_head, - const LutT* lut_scores, - const bool valid_range) -> float -{ - float score = 0.0; - constexpr uint32_t kBitsTotal = 8 * sizeof(OpT); - constexpr uint32_t kVecLen = kBitsTotal / gcd(kBitsTotal, PqBits); - - // These two nested loops combined span exactly `pq_dim` iterations. - for (; pq_dim > 0; pq_dim -= kVecLen) { - OpT pq_code = valid_range ? __ldcs(pq_head) : OpT(0); - ++pq_head; - compute_score_inner_loop( - pq_head, lut_scores, pq_code, score, valid_range); - } - return score; -} - -/** - * @brief Compute the similarity score between a vector from `pq_dataset` and a query vector. - * - * @tparam OpT an unsigned integer type that is used for bit operations on multiple PQ codes - * at once; it's selected to maximize throughput while matching criteria: - * 1. `pq_bits * vec_len % 8 * sizeof(OpT) == 0`. - * 2. `pq_dim % vec_len == 0` - * - * Here `vec_len` is fully determined by `OpT` and `pq_bits`: - * vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) - * == lcm(8 * sizeof(OpT), pq_bits) / pq_bits - * - * - * @tparam LutT type of the elements in the lookup table. - * - * @param pq_bits The bit length of an encoded vector element after compression by PQ - * @param pq_dim - * @param[in] pq_code_ptr - * a device pointer to the dataset at the indexed position (`pq_dim * pq_bits` bits-wide) - * @param[in] lut_scores - * a device or shared memory pointer to the lookup table [pq_dim, pq_book_size] - * @param valid_range - * whether reads and writes are allowed - * - * @return the score for the entry `data_ix` in the `pq_dataset`. - */ -template -__device__ __noinline__ auto ivfpq_compute_score(const uint32_t pq_bits, - uint32_t pq_dim, - const OpT* pq_head, - const LutT* lut_scores, - const bool valid_range) -> float -{ - switch (pq_bits) { - case 4: return compute_score<4, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); - case 5: return compute_score<5, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); - case 6: return compute_score<6, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); - case 7: return compute_score<7, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); - case 8: return compute_score<8, OpT, LutT>(pq_dim, pq_head, lut_scores, valid_range); - } -} - -// template -// __device__ __forceinline__ auto ivfpq_compute_score(const uint32_t pq_bits, -// const uint32_t vec_len, -// uint32_t pq_dim, -// const OpT* pq_head, -// const LutT* lut_scores, -// bool valid_range) -> float -// { -// float score = 0.0; -// constexpr int32_t kBitsTotal = 8 * sizeof(OpT); -// // These two nested loops combined span exactly `pq_dim` iterations. -// for (; pq_dim > 0; pq_dim -= vec_len) { -// OpT pq_code = valid_range ? *pq_head : OpT(0); -// ++pq_head; -// // Loop over lcm(kBitsTotal, pq_bits) = pq_bits * vec_len. -// // Thanks to `lcm` in the expression above, -// // `bits_left` is zero after exactly vec_len iterations. 
-// #pragma unroll sizeof(OpT) -// for (int32_t bits_left = kBitsTotal; bits_left != 0;) { -// uint8_t code = extract_bits(pq_bits, pq_code); -// bits_left -= pq_bits; -// if (bits_left < 0) { // `code` is on the border between two `OpT` ints -// pq_code = valid_range ? *pq_head : OpT(0); -// ++pq_head; -// code <<= -bits_left; -// code |= extract_bits(-bits_left, pq_code); -// bits_left += kBitsTotal; -// } -// score += valid_range ? float(lut_scores[code]) : 0.0f; -// lut_scores += (1 << pq_bits); -// } -// } -// return score; -// } - template struct dummy_block_sort_t { using queue_t = topk::warp_sort_distributed; @@ -677,8 +536,7 @@ using block_sort_t = typename pq_block_sort::type; * The device pointer to the output indices [n_queries, n_probes, topk]. * Ignored when `Capacity == 0`. */ -template 0; const uint32_t pq_len = dim / pq_dim; - const uint32_t lut_size = pq_dim << pq_bits; + const uint32_t pq_shift = 1u << pq_bits; + const uint32_t lut_size = pq_dim * pq_shift; + const uint32_t pq_mask = pq_shift - 1u; if constexpr (EnableSMemLut) { lut_scores = reinterpret_cast(smem_buf); @@ -746,8 +606,9 @@ __launch_bounds__(1024, 1) __global__ query_ix = ib % n_queries; probe_ix = ib / n_queries; } else { - query_ix = index_list[ib] / n_probes; - probe_ix = index_list[ib] % n_probes; + auto ordered_ix = index_list[ib]; + query_ix = ordered_ix / n_probes; + probe_ix = ordered_ix % n_probes; } const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix); @@ -788,6 +649,7 @@ __launch_bounds__(1024, 1) __global__ reinterpret_cast(base_diff)[i] = pvals; } } break; + default: __builtin_unreachable(); } __syncthreads(); } @@ -796,13 +658,12 @@ __launch_bounds__(1024, 1) __global__ // Create a lookup table // For each subspace, the lookup table stores the distance between the actual query vector // (projected into the subspace) and all possible pq vectors in that subspace. - const uint32_t pq_shift = 1u << pq_bits; for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { const uint32_t i_pq = i >> pq_bits; uint32_t j = i_pq * pq_len; const uint32_t j_end = pq_len + j; - auto cur_pq_center = pq_center + (i & (pq_shift - 1u)) + - (codebook_kind == codebook_gen::PER_SUBSPACE ? j << pq_bits : 0u); + auto cur_pq_center = pq_center + (i & pq_mask) + + (codebook_kind == codebook_gen::PER_SUBSPACE ? 
j * pq_shift : 0u); float score = 0.0; do { float pq_c = *cur_pq_center; @@ -831,22 +692,32 @@ __launch_bounds__(1024, 1) __global__ } score -= q * pq_c; } break; + default: __builtin_unreachable(); } } while (++j < j_end); lut_scores[i] = LutT(score); } } + using group_align = Pow2; + using vec_align = Pow2; + using local_topk_t = block_sort_t; + using op_t = uint32_t; + using vec_t = TxN_t; + uint32_t sample_offset = 0; if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } - uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; - uint32_t n_samples32 = Pow2<32>::roundUp(n_samples); - IdxT cluster_offset = cluster_offsets[label]; - const uint32_t pq_line_width = pq_dim * pq_bits / 8; - auto pq_cluster_data = pq_dataset + size_t(cluster_offset) * size_t(pq_line_width); - - using local_topk_t = block_sort_t; - OutT query_kth = local_topk_t::queue_t::kDummy; + uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; + uint32_t n_samples_aligned = group_align::roundUp(n_samples); + IdxT cluster_offset = cluster_offsets[label]; + uint32_t pq_line_width = vec_align::roundUp(pq_dim * pq_bits / 8); + auto pq_thread_data = + pq_dataset + (size_t(cluster_offset) + group_align::roundDown(threadIdx.x)) * pq_line_width + + group_align::mod(threadIdx.x) * vec_align::Value; + pq_line_width *= blockDim.x; + + constexpr OutT kDummy = upper_bound(); + OutT query_kth = kDummy; if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } local_topk_t block_topk(topk, smem_buf, query_kth); @@ -855,19 +726,52 @@ __launch_bounds__(1024, 1) __global__ __syncthreads(); // Compute a distance for each sample - for (uint32_t i = threadIdx.x; i < n_samples32; i += blockDim.x) { - OutT score = local_topk_t::queue_t::kDummy; - float fscore = ivfpq_compute_score( - pq_bits, - pq_dim, - reinterpret_cast(pq_cluster_data + pq_line_width * i), - lut_scores, - i < n_samples); - if (i < n_samples && fscore < float(score)) { score = OutT(fscore); } + for (uint32_t i = threadIdx.x; i < n_samples_aligned; + i += blockDim.x, pq_thread_data += pq_line_width) { + float score = 0; + bool valid = i < n_samples; + + { + // pq_reader_t reader(pq_thread_data); + auto pq_head = reinterpret_cast(pq_thread_data); + uint32_t bits_left = 0; + op_t pq_code = 0; + auto lut_head = lut_scores; + auto lut_end = lut_scores + lut_size; + vec_t pq_codes; + do { + *pq_codes.vectorized_data() = *pq_head; + pq_head += kIndexGroupSize; +#pragma unroll + for (uint32_t l = 0; l < vec_t::Ratio; l++) { + while (bits_left >= pq_bits && lut_head < lut_end) { + uint8_t code = pq_code & pq_mask; + pq_code >>= pq_bits; + score += valid ? float(lut_head[code]) : 0.0f; + lut_head += pq_shift; + bits_left -= pq_bits; + } + if (lut_head < lut_end) { + uint32_t rem_bits = pq_bits - bits_left; + uint8_t code = pq_code << rem_bits; + pq_code = pq_codes.val.data[l]; + code |= pq_code & ((1u << rem_bits) - 1u); + pq_code >>= rem_bits; + score += valid ? 
float(lut_head[code]) : 0.0f; + lut_head += pq_shift; + bits_left += 8 * sizeof(op_t) - pq_bits; + } else { + break; + } + } + } while (lut_head < lut_end); + } + + if (!valid || score >= float(kDummy)) { score = float(kDummy); } if constexpr (kManageLocalTopK) { - block_topk.add(score, cluster_offset + i); + block_topk.add(OutT(score), cluster_offset + i); } else { - if (i < n_samples) { out_scores[i + sample_offset] = score; } + if (valid) { out_scores[i + sample_offset] = OutT(score); } } } if constexpr (kManageLocalTopK) { @@ -882,7 +786,7 @@ __launch_bounds__(1024, 1) __global__ if (probe_ix + 1 == n_probes) { for (uint32_t i = threadIdx.x + sample_offset + n_samples; i < max_samples; i += blockDim.x) { - out_scores[i] = local_topk_t::queue_t::kDummy; + out_scores[i] = kDummy; } } } @@ -930,43 +834,28 @@ struct ivfpq_compute_similarity { * @param pq_dim * @param k_max */ - static auto kernel(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t + static auto kernel(uint32_t k_max) -> kernel_t { - return kernel_base(pq_bits, pq_dim, k_max); + return kernel_try_capacity(k_max); } private: - template + template static auto kernel_try_capacity(uint32_t k_max) -> kernel_t { if constexpr (Capacity > 0) { - if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } + if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity<0>(k_max); } } if constexpr (Capacity > 1) { - if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } + if (k_max * 2 <= Capacity) { return kernel_try_capacity<(Capacity / 2)>(k_max); } } - return ivfpq_compute_similarity_kernel; } - - static auto kernel_base(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t - { - switch (gcd(pq_bits * pq_dim, 64)) { - case 64: return kernel_try_capacity(k_max); - case 32: return kernel_try_capacity(k_max); - case 16: return kernel_try_capacity(k_max); - case 8: return kernel_try_capacity(k_max); - default: - RAFT_FAIL("`pq_bits * pq_dim` must be a multiple of 8 (pq_bits = %u, pq_dim = %u).", - pq_bits, - pq_dim); - } - } }; struct selected { @@ -1008,11 +897,9 @@ struct ivfpq_compute_similarity { using conf_no_basediff = configured; using conf_no_smem_lut = configured; - kernel_t kernel_fast = conf_fast::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); - kernel_t kernel_no_basediff = - conf_no_basediff::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); - kernel_t kernel_no_smem_lut = - conf_no_smem_lut::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); + kernel_t kernel_fast = conf_fast::kernel(manage_local_topk ? topk : 0u); + kernel_t kernel_no_basediff = conf_no_basediff::kernel(manage_local_topk ? topk : 0u); + kernel_t kernel_no_smem_lut = conf_no_smem_lut::kernel(manage_local_topk ? topk : 0u); const size_t smem_threshold = 48 * 1024; size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); @@ -1160,6 +1047,7 @@ void ivfpq_search_worker(const handle_t& handle, auto data_indices = index.indices().data_handle(); auto cluster_centers = index.centers_rot().data_handle(); auto cluster_offsets = index.list_offsets().data_handle(); + auto cluster_sizes = index.list_sizes().data_handle(); bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); auto topk_len = manage_local_topk ? 
n_probes * topK : max_samples; @@ -1183,8 +1071,8 @@ void ivfpq_search_worker(const handle_t& handle, neighbors_ptr = neighbors_buf.data(); } - calc_chunk_indices::configure(n_probes, n_queries)( - cluster_offsets, clusters_to_probe, chunk_index.data(), num_samples.data(), stream); + calc_chunk_indices::configure(n_probes, n_queries)( + cluster_sizes, clusters_to_probe, chunk_index.data(), num_samples.data(), stream); if (n_queries * n_probes > 256) { // Sorting index by cluster number (label). @@ -1255,20 +1143,6 @@ void ivfpq_search_worker(const handle_t& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - printf( - "===| raft: n_rows = %zu, n_lists = %u, n_queries = %u, n_probes = %u, dim = %u, pq_dim = " - "%u, k = %u, max_samples = %u, metric = %d, codebook_kind = %d\n", - uint64_t(index.size()), - uint32_t(index.n_lists()), - uint32_t(n_queries), - uint32_t(n_probes), - uint32_t(index.rot_dim()), - uint32_t(index.pq_dim()), - uint32_t(topK), - uint32_t(max_samples), - int(index.metric()), - int(index.codebook_kind())); - rmm::device_uvector query_kths(0, stream, mr); if (manage_local_topk) { query_kths.resize(n_queries, stream); From 57ae845490c0a5fcb580c357e628ef9147435565 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 3 Nov 2022 11:42:59 +0100 Subject: [PATCH 14/31] Overhauled the launch configuration --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 17 +- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 490 ++++++++++++------ cpp/test/neighbors/ann_ivf_pq.cuh | 5 - 3 files changed, 331 insertions(+), 181 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index f4db8b7463..4cfdf6cc0a 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -108,12 +108,21 @@ struct search_params : ann::search_params { */ cudaDataType_t internal_distance_dtype = CUDA_R_32F; /** - * Thread block size of the distance calculation kernel at search time. - * When zero, an optimal block size is selected using a heuristic. + * Preferred fraction of SM's unified memory / L1 cache to be used as shared memory. * - * Possible values: [0, 256, 512, 1024] + * Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`. + * + * One wants to increase the carveout to make sure a good GPU occupancy for the main search + * kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this + * value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache + * configurations, so the provided value is rounded up to the nearest configuration. Refer to the + * NVIDIA tuning guide for the target GPU architecture. + * The default value of `0.64` seems to be a good compromise for the majority of the current GPUs. + * + * Note, this is a low-level tuning parameter that can have drastic negative effects on the search + * performance if tweaked incorrectly. 
*/ - uint32_t preferred_thread_block_size = 0; + double preferred_shmem_carveout = 0.64; }; static_assert(std::is_aggregate_v); diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 1bdf9ce6cb..e3e54b40ec 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -542,28 +542,27 @@ template -__launch_bounds__(1024, 1) __global__ - void ivfpq_compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, - uint32_t n_probes, - uint32_t pq_bits, - uint32_t pq_dim, - uint32_t n_queries, - distance::DistanceType metric, - codebook_gen codebook_kind, - uint32_t topk, - const float* cluster_centers, - const float* pq_centers, - const uint8_t* pq_dataset, - const IdxT* cluster_offsets, - const uint32_t* cluster_labels, - const uint32_t* _chunk_indices, - const float* queries, - const uint32_t* index_list, - float* query_kths, - LutT* lut_scores, - OutT* _out_scores, - IdxT* _out_indices) +__global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, + uint32_t dim, + uint32_t n_probes, + uint32_t pq_bits, + uint32_t pq_dim, + uint32_t n_queries, + distance::DistanceType metric, + codebook_gen codebook_kind, + uint32_t topk, + const float* cluster_centers, + const float* pq_centers, + const uint8_t* pq_dataset, + const IdxT* cluster_offsets, + const uint32_t* cluster_labels, + const uint32_t* _chunk_indices, + const float* queries, + const uint32_t* index_list, + float* query_kths, + LutT* lut_scores, + OutT* _out_scores, + IdxT* _out_indices) { /* Shared memory: @@ -732,7 +731,6 @@ __launch_bounds__(1024, 1) __global__ bool valid = i < n_samples; { - // pq_reader_t reader(pq_thread_data); auto pq_head = reinterpret_cast(pq_thread_data); uint32_t bits_left = 0; op_t pq_code = 0; @@ -793,6 +791,79 @@ __launch_bounds__(1024, 1) __global__ } } +/** + * An approximation to the number of times each cluster appears in a batched sample. + * + * If the pairs (probe_ix, query_ix) are sorted by the probe_ix, there is a good chance that + * the same probe_ix (cluster) is processed by several blocks on a single SM. This greatly + * increases the L1 cache hit rate (i.e. increases the data locality). + * + * This function gives an estimate of how many times a specific cluster may appear in the + * batch. Thus, it gives a practical limit to how many blocks should be active on the same SM + * to improve the L1 cache hit rate. + */ +constexpr inline auto expected_probe_coresidency(uint32_t n_clusters, + uint32_t n_probes, + uint32_t n_queries) -> uint32_t +{ + /* + Let say: + n = n_clusters + k = n_probes + m = n_queries + r = # of times a specific block appears in the batched sample. + + Then: + P(r) = C(n,k) * k^r * (n - k)^r / n^m + E[r] = m * k / n + E[r | r > 0] = m * k / n / (1 - (1 - k/n)^m) + + The latter can be approximated by a much simpler formula, assuming (k / n) -> 0: + E[r | r > 0] = 1 + (m - 1) * k / (2 * n) + O( (k/n)^2 ) + */ + return 1 + (n_queries - 1) * n_probes / (2 * n_clusters); +} + +/** + * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout`, + * given by a desired schmem-L1 split and a per-block memory requirement in bytes. + * + * @param shmem_fraction + * a fraction representing a desired split (shmem / (shmem + L1)) [0, 1]. + * @param shmem_per_block + * a shared memory usage per block (dynamic + static shared memory sizes), in bytes. + * @param dev_props + * device properties. 
+ * @return + * a carveout value in percents [0, 100]. + */ +constexpr inline auto estimate_carveout(double shmem_fraction, + size_t shmem_per_block, + const cudaDeviceProp& dev_props) -> int +{ + using shmem_unit = Pow2<128>; + /* + Memory carveout setting is just a hint for the driver; it's free to choose any shmem-L1 + configuration it deems appropriate. Nevertheless, we can guide it a little to choose a more + optimal setting in non-edge cases. Entirely undocumented, the driver seems to perform the + following steps to calculate the final split: + 1. Calculate the hinted memory usage: + mem_use_hint = (carveout_hint * sharedMemPerMultiprocessor / 100) + 2. Get the maximum occupancy (possibly including other kernel limits): + blocks_per_sm = min(other_block_limits, mem_use_hint / shmem_unit::roundUp(shmem_per_block)). + 3. Add up the hinted memory and the system per-block memory: + requested_mem = mem_use_hint + blocks_per_sm * reservedSharedMemPerBlock + 4. Round up to the nearest possible config + (e.g. one of 0,8,16,32,64,100 KB for compute capability 8.6). + + The algorithm below performs the reverse of steps 1-3. + */ + size_t m = shmem_unit::roundUp(shmem_per_block); + size_t r = dev_props.reservedSharedMemPerBlock; + size_t s = dev_props.sharedMemPerMultiprocessor; + return (size_t(100 * s * m * shmem_fraction) - (m - 1) * r) / (s * (m + r)); +} + /** * This structure selects configurable template parameters (instance) based on * the search/index parameters at runtime. @@ -802,38 +873,12 @@ __launch_bounds__(1024, 1) __global__ */ template struct ivfpq_compute_similarity { - using kernel_t = void (*)(uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - distance::DistanceType, - codebook_gen, - uint32_t, - const float*, - const float*, - const uint8_t*, - const IdxT*, - const uint32_t*, - const uint32_t*, - const float*, - const uint32_t*, - float*, - LutT*, - OutT*, - IdxT*); + using kernel_t = decltype(&ivfpq_compute_similarity_kernel); template struct configured { public: - /** - * Select a proper kernel instance based on the runtime parameters. - * - * @param pq_bits - * @param pq_dim - * @param k_max - */ + /** Select a proper kernel instance based on the runtime parameters. */ static auto kernel(uint32_t k_max) -> kernel_t { return kernel_try_capacity(k_max); @@ -859,7 +904,7 @@ struct ivfpq_compute_similarity { }; struct selected { - void* kernel; + kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; @@ -868,8 +913,8 @@ struct ivfpq_compute_similarity { template void operator()(rmm::cuda_stream_view stream, Args... args) { - void* xs[] = {&args...}; // NOLINT - RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, xs, smem_size, stream)); + kernel<<>>(args...); + RAFT_CHECK_CUDA(stream); } }; @@ -883,129 +928,243 @@ struct ivfpq_compute_similarity { * whether use the fused calculate+select or just calculate the distances for each * query and probed cluster. * + * @param locality_hint + * beyond this limit do not consider increasing the number of active blocks per SM + * would improve locality anymore. 
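   *
   *   Illustrative numbers only: with n_lists = 1024, n_probes = 32 and n_queries = 128,
   *   `expected_probe_coresidency` returns 1 + (127 * 32) / (2 * 1024) = 2 (integer
   *   arithmetic), so keeping more than a couple of blocks resident on an SM is unlikely
   *   to improve the L1 hit rate any further.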
*/ static inline auto select(bool manage_local_topk, + int locality_hint, + double preferred_shmem_carveout, uint32_t pq_bits, uint32_t pq_dim, uint32_t precomp_data_count, - uint32_t preferred_thread_block_size, uint32_t n_queries, uint32_t n_probes, uint32_t topk) -> selected { - using conf_fast = configured; - using conf_no_basediff = configured; - using conf_no_smem_lut = configured; - - kernel_t kernel_fast = conf_fast::kernel(manage_local_topk ? topk : 0u); - kernel_t kernel_no_basediff = conf_no_basediff::kernel(manage_local_topk ? topk : 0u); - kernel_t kernel_no_smem_lut = conf_no_smem_lut::kernel(manage_local_topk ? topk : 0u); - - const size_t smem_threshold = 48 * 1024; - size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); - size_t bdf_mem = sizeof(float) * precomp_data_count; - - uint32_t n_blocks = n_queries * n_probes; - uint32_t n_threads = 1024; - // preferred_thread_block_size == 0 means using auto thread block size calculation mode - if (preferred_thread_block_size == 0) { - const uint32_t thread_min = 256; + cudaDeviceProp dev_props; + { int cur_dev; - cudaDeviceProp dev_props; RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); RAFT_CUDA_TRY(cudaGetDeviceProperties(&dev_props, cur_dev)); - while (n_threads > thread_min) { - if (n_blocks < uint32_t(getMultiProcessorCount() * (1024 / (n_threads / 2)))) { break; } - if (dev_props.sharedMemPerMultiprocessor * 2 / 3 < lut_mem * (1024 / (n_threads / 2))) { - break; + } + // Shared memory for storing the lookup table + size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); + // Shared memory for storing pre-computed pieces to speedup the lookup table construction + // (e.g. the distance between a cluster center and the query for L2). + size_t bdf_mem = sizeof(float) * precomp_data_count; + // Shared memory for the fused top-k component; it may overlap with the other uses of shared + // memory and depends on the number of threads. + struct ltk_mem_t { + uint32_t subwarp_size; + uint32_t topk; + bool manage_local_topk; + ltk_mem_t(bool manage_local_topk, uint32_t topk) + : manage_local_topk(manage_local_topk), topk(topk) + { + subwarp_size = WarpSize; + while (topk * 2 <= subwarp_size) { + subwarp_size /= 2; } - n_threads /= 2; } - } else { - n_threads = preferred_thread_block_size; + + [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t + { + return manage_local_topk ? topk::template calc_smem_size_for_block_wide( + n_threads / subwarp_size, topk) + : 0; + } + } ltk_mem{manage_local_topk, topk}; + + // Total amount of work; should be enough to occupy the GPU. + uint32_t n_blocks = n_queries * n_probes; + + // The minimum block size we may want: + // 1. It's a power-of-two for efficient L1 caching of pq_centers values + // (multiples of `1 << pq_bits`). + // 2. It should be large enough to fully utilize an SM. + uint32_t n_threads_min = WarpSize; + while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) < + dev_props.maxThreadsPerMultiProcessor) { + n_threads_min *= 2; + } + // Further increase the minimum block size to make sure full device occupancy + // (NB: this may lead to `n_threads_min` being larger than the kernel's maximum) + while (int(n_blocks * n_threads_min) < + dev_props.multiProcessorCount * dev_props.maxThreadsPerMultiProcessor && + int(n_threads_min) < dev_props.maxThreadsPerBlock) { + n_threads_min *= 2; } - uint32_t subwarp_size = WarpSize; - while (topk * 2 <= subwarp_size) { - subwarp_size /= 2; + // Even further, increase it to allow less blocks per SM if there not enough queries. 
+ // With this, we reduce the chance of different clusters being processed by two blocks + // on the same SM and thus improve the data locality for L1 caching. + while (int(n_queries * n_threads_min) < dev_props.maxThreadsPerMultiProcessor && + int(n_threads_min) < dev_props.maxThreadsPerBlock) { + n_threads_min *= 2; } - size_t smem_size_local_topk = - manage_local_topk - ? topk::template calc_smem_size_for_block_wide(n_threads / subwarp_size, topk) - : 0; - size_t smem_fast = max(lut_mem + bdf_mem, smem_size_local_topk); - size_t smem_no_basediff = max(lut_mem, smem_size_local_topk); - size_t smem_no_smem_lut = max(bdf_mem, smem_size_local_topk); - - kernel_t kernel = kernel_no_basediff; - size_t smem_size = smem_no_basediff; - - bool kernel_no_basediff_available = true; - bool use_smem_lut = true; - if (smem_no_basediff > smem_threshold) { - cudaError_t cuda_status = cudaFuncSetAttribute( - kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + // Granularity of changing the number of threads when computing the maximum block size. + // It's good to have it multiple of the PQ book width. + uint32_t n_threads_gty = round_up_safe(1u << pq_bits, WarpSize); + + /* + Shared memory / L1 cache balance is the main limiter of this kernel. + The more blocks per SM we launch, the more shared memory we need. Besides that, we have + three versions of the kernel varying in performance and shmem usage. + + We try the most demanding and the fastest kernel first, trying to maximize occupancy with + the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further + optimize occupancy and data locality for the L1 cache. + */ + using conf_fast = configured; + using conf_no_basediff = configured; + using conf_no_smem_lut = configured; + auto topk_or_zero = manage_local_topk ? topk : 0u; + std::array candidates{std::make_tuple(conf_fast::kernel(topk_or_zero), lut_mem + bdf_mem, true), + std::make_tuple(conf_no_basediff::kernel(topk_or_zero), lut_mem, true), + std::make_tuple(conf_no_smem_lut::kernel(topk_or_zero), bdf_mem, false)}; + + // allocation unit size (for rounding up) + using shmem_unit = Pow2<128>; + // we may allow slightly lower than 100% occupancy; + constexpr double kTargetOccupancy = 0.75; + // these two parameters are used to select the better candidate + double selected_occupancy = 0.0; + double selected_shmem_use = 1.0; + selected selected_config; + for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) { + if (smem_size_const > dev_props.sharedMemPerBlockOptin) { + // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate. + continue; + } + + // First, we set the carveout hint to the preferred value. The driver will increase this if + // needed to run at least one block per SM. At the same time, if more blocks fit into one SM, + // this carveout value will limit the calculated occupancy. When we're done selecting the best + // launch configuration, we will tighten the carveout once more, based on the final memory + // usage and occupancy. 
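      // NB: the carveout attribute set below is a percentage in [0, 100] (or -1 for the
      // device default); `estimate_carveout` converts the requested shmem fraction and the
      // per-block shmem requirement into that percentage.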
+ const int max_carveout = + estimate_carveout(preferred_shmem_carveout, smem_size_const, dev_props); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, max_carveout)); + + // Get the theoretical maximum possible number of threads per block + cudaFuncAttributes kernel_attrs; + RAFT_CUDA_TRY(cudaFuncGetAttributes(&kernel_attrs, kernel)); + uint32_t n_threads = + round_down_safe(kernel_attrs.maxThreadsPerBlock, n_threads_gty); + + // Actual required shmem depens on the number of threads + size_t smem_size = max(smem_size_const, ltk_mem(n_threads)); + + // Make sure the kernel can get enough shmem. + cudaError_t cuda_status = + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (cuda_status != cudaSuccess) { RAFT_EXPECTS( cuda_status == cudaGetLastError(), "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_no_basediff_available = false; - - // Use "kernel_no_smem_lut" which just uses small amount of shared memory. - RAFT_LOG_DEBUG( - "Non-shared-mem look-up table kernel is selected, because it wouldn't fit shmem " - "required: " - "%zu bytes)", - smem_size); - kernel = kernel_no_smem_lut; - smem_size = smem_no_smem_lut; - use_smem_lut = false; - n_threads = 1024; - smem_size_local_topk = manage_local_topk - ? topk::template calc_smem_size_for_block_wide( - n_threads / subwarp_size, topk) - : 0; - n_blocks = getMultiProcessorCount(); + // Failed to request enough shmem for the kernel. Skip the candidate. + continue; } - } - if (kernel_no_basediff_available) { - bool kernel_fast_available = true; - if (smem_fast > smem_threshold) { - cudaError_t cuda_status = - cudaFuncSetAttribute(kernel_fast, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_fast); - if (cuda_status != cudaSuccess) { - RAFT_EXPECTS( - cuda_status == cudaGetLastError(), - "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_fast_available = false; - RAFT_LOG_DEBUG( - "No-precomputed-basediff kernel is selected, because the basediff wouldn't fit (shmem " - "required: %zu bytes)", - smem_fast); + + int blocks_per_sm; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, kernel, n_threads, smem_size)); + if (blocks_per_sm <= 0) { + // For some reason, we still cannot make this kernel run. Skip the candidate. + continue; + } + + auto occupancy = + double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor); + auto shmem_use = double(shmem_unit::roundUp(smem_size) * blocks_per_sm) / + double(dev_props.sharedMemPerMultiprocessor); + + { + // Try to reduce the number of threads to increase occupancy and data locality + auto n_threads_tmp = n_threads_min; + while (n_threads_tmp * 2 < n_threads) { + n_threads_tmp *= 2; + } + if (n_threads_tmp < n_threads) { + while (n_threads_tmp >= n_threads_min) { + int blocks_per_sm_tmp; + auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp)); + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm_tmp, kernel, n_threads_tmp, smem_size_tmp)); + auto occupancy_tmp = double(blocks_per_sm_tmp * n_threads_tmp) / + double(dev_props.maxThreadsPerMultiProcessor); + auto shmem_use_tmp = double(shmem_unit::roundUp(smem_size) * blocks_per_sm_tmp) / + double(dev_props.sharedMemPerMultiprocessor); + bool select_it = false; + if (lut_is_in_shmem && locality_hint >= blocks_per_sm_tmp) { + // Normally, the smaller the block the better for L1 cache hit rate. 
+ // Hence, the occupancy should be "just good enough" + select_it = occupancy_tmp >= min(kTargetOccupancy, occupancy); + } else if (lut_is_in_shmem) { + // If we don't have enough repeating probes (locality_hint < blocks_per_sm_tmp), + // the locality is not going to improve with increasing the number of blocks per SM. + // Hence, the only metric here is the occupancy. + select_it = occupancy_tmp > occupancy; + } else { + // If we don't use shared memory for the lookup table, increasing the number of blocks + // is very taxing on the global memory usage. + // In this case, the occupancy must increase a lot to make it worth the cost. + select_it = occupancy_tmp >= min(1.0, occupancy / kTargetOccupancy); + } + if (select_it) { + n_threads = n_threads_tmp; + smem_size = smem_size_tmp; + blocks_per_sm = blocks_per_sm_tmp; + occupancy = occupancy_tmp; + shmem_use = shmem_use_tmp; + } + n_threads_tmp /= 2; + } } } - if (kernel_fast_available) { - int kernel_no_basediff_n_blocks = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_no_basediff_n_blocks, kernel_no_basediff, n_threads, smem_no_basediff)); - - int kernel_fast_n_blocks = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_fast_n_blocks, kernel_fast, n_threads, smem_fast)); - - // Use "kernel_fast" only if GPU occupancy does not drop too much - if (kernel_no_basediff_n_blocks * 3 <= kernel_fast_n_blocks * 4) { - kernel = kernel_fast; - smem_size = smem_fast; + + { + if (selected_occupancy <= 0.0 // no candidate yet + || (selected_occupancy < occupancy * kTargetOccupancy && + selected_shmem_use >= shmem_use) // much improved occupancy + ) { + selected_occupancy = occupancy; + selected_shmem_use = shmem_use; + if (lut_is_in_shmem) { + selected_config = { + kernel, dim3(n_blocks, 1, 1), dim3(n_threads, 1, 1), smem_size, size_t(0)}; + } else { + // When the global memory is used for the lookup table, we need to minimize the grid + // size; otherwise, the kernel may quickly run out of memory. + auto n_blocks_min = + std::min(n_blocks, blocks_per_sm * dev_props.multiProcessorCount); + selected_config = {kernel, + dim3(n_blocks_min, 1, 1), + dim3(n_threads, 1, 1), + smem_size, + size_t(n_blocks_min) * size_t(pq_dim << pq_bits)}; + } + // Actual shmem/L1 split wildly rounds up the specified preferred carveout, so we set here + // a rather conservative bar; most likely, the kernel gets more shared memory than this, + // and the occupancy doesn't get hurt. + auto carveout = std::min(max_carveout, std::ceil(100.0 * selected_shmem_use)); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); + if (occupancy >= kTargetOccupancy) { break; } + } else if (selected_occupancy > 0.0) { + // If we found a reasonable candidate on a previous iteration, and this one is not better, + // then don't try any more candidates because they are much slower anyway. + break; } } } - uint32_t device_lut_size = use_smem_lut ? 
0u : n_blocks * (pq_dim << pq_bits); - return {reinterpret_cast(kernel), - dim3(n_blocks, 1, 1), - dim3(n_threads, 1, 1), - smem_size, - device_lut_size}; + RAFT_EXPECTS(selected_occupancy > 0.0, + "Couldn't determine a working kernel launch configuration."); + + return selected_config; } }; @@ -1031,13 +1190,13 @@ void ivfpq_search_worker(const handle_t& handle, uint32_t max_samples, uint32_t n_probes, uint32_t topK, - uint32_t preferred_thread_block_size, uint32_t n_queries, const uint32_t* clusters_to_probe, // [n_queries, n_probes] const float* query, // [n_queries, rot_dim] IdxT* neighbors, // [n_queries, topK] float* distances, // [n_queries, topK] float scaling_factor, + double preferred_shmem_carveout, rmm::mr::device_memory_resource* mr) { auto stream = handle.get_stream(); @@ -1074,7 +1233,9 @@ void ivfpq_search_worker(const handle_t& handle, calc_chunk_indices::configure(n_probes, n_queries)( cluster_sizes, clusters_to_probe, chunk_index.data(), num_samples.data(), stream); - if (n_queries * n_probes > 256) { + auto coresidency = expected_probe_coresidency(index.n_lists(), n_probes, n_queries); + + if (coresidency > 1) { // Sorting index by cluster number (label). // The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as @@ -1132,12 +1293,14 @@ void ivfpq_search_worker(const handle_t& handle, RAFT_FAIL("Unsupported metric"); } break; } + auto search_instance = ivfpq_compute_similarity::select(manage_local_topk, + coresidency, + preferred_shmem_carveout, index.pq_bits(), index.pq_dim(), precomp_data_count, - preferred_thread_block_size, n_queries, n_probes, topK); @@ -1209,19 +1372,7 @@ void ivfpq_search_worker(const handle_t& handle, template struct ivfpq_search { public: - using fun_t = void (*)(const handle_t&, - const index&, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - const uint32_t*, - const float*, - IdxT*, - float*, - float, - rmm::mr::device_memory_resource*); + using fun_t = decltype(&ivfpq_search_worker); /** * Select an instance of the ivf-pq search function based on search tuning parameters, @@ -1341,11 +1492,6 @@ inline void search(const handle_t& handle, RAFT_EXPECTS(params.lut_dtype == CUDA_R_16F || params.lut_dtype == CUDA_R_32F || params.lut_dtype == CUDA_R_8U, "lut_dtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U"); - RAFT_EXPECTS( - params.preferred_thread_block_size == 256 || params.preferred_thread_block_size == 512 || - params.preferred_thread_block_size == 1024 || params.preferred_thread_block_size == 0, - "preferred_thread_block_size must be 0, 256, 512 or 1024, but %u is given.", - params.preferred_thread_block_size); RAFT_EXPECTS(k > 0, "parameter `k` in top-k must be positive."); RAFT_EXPECTS( k <= index.size(), @@ -1452,13 +1598,13 @@ inline void search(const handle_t& handle, max_samples, params.n_probes, k, - params.preferred_thread_block_size, batch_size, clusters_to_probe.data() + uint64_t(params.n_probes) * offset_b, rot_queries.data() + uint64_t(index.rot_dim()) * offset_b, neighbors + uint64_t(k) * (offset_q + offset_b), distances + uint64_t(k) * (offset_q + offset_b), utils::config::kDivisor / utils::config::kDivisor, + params.preferred_shmem_carveout, mr); } } diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 6a8bc86e84..36fbef4bd5 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -102,7 +102,6 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> 
std::ostream PRINT_DIFF_V(.search_params.lut_dtype, print_dtype{p.search_params.lut_dtype}); PRINT_DIFF_V(.search_params.internal_distance_dtype, print_dtype{p.search_params.internal_distance_dtype}); - PRINT_DIFF(.search_params.preferred_thread_block_size); os << "}"; return os; } @@ -365,10 +364,6 @@ inline auto enum_variety() -> test_cases_t ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_32F; }); ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_16F; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 256; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 512; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 1024; }); - return xs; } From 0e4e631388eaf4549efdcc2e5108f417c70df32d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 9 Nov 2022 11:13:14 +0100 Subject: [PATCH 15/31] Changed pq_dataset layout once again and promoted pq_bits to the template param --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 9 +- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 134 ++++++++++-------- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 109 +++++++------- 3 files changed, 140 insertions(+), 112 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 4cfdf6cc0a..bdac4cff75 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -323,8 +323,8 @@ struct index : ann::index { using pq_dataset_extents = std::experimental:: extents; /** PQ-encoded data - * [ceildiv(size, kIndexGroupSize) - * , ceildiv(pq_dim * pq_bits / 8, kIndexGroupVecLen) + * [ ceildiv(size, kIndexGroupSize) + * , ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits) * , kIndexGroupSize * , kIndexGroupVecLen * ]. @@ -412,9 +412,10 @@ struct index : ann::index { */ auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents { - auto l = pq_dim() * pq_bits() / 8; + // how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk + auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits(); return make_extents(raft::div_rounding_up_safe(n_rows, kIndexGroupSize), - raft::div_rounding_up_safe(l, kIndexGroupVecLen), + raft::div_rounding_up_safe(pq_dim(), pq_chunk), kIndexGroupSize, kIndexGroupVecLen); } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 5a0c92e264..cc4fb12a01 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -55,6 +55,9 @@ using raft::neighbors::ivf_pq::codebook_gen; using raft::neighbors::ivf_pq::index; using raft::neighbors::ivf_pq::index_params; using raft::neighbors::ivf_pq::kIndexGroupSize; +using raft::neighbors::ivf_pq::kIndexGroupVecLen; + +using pq_codes_exts = extents; namespace { @@ -114,55 +117,68 @@ struct bitfield_view_t { NB: label type is uint32_t although it can only contain values up to `1 << pq_bits`. We keep it this way to not force one more overload for kmeans::predict. 
*/ -template -HDI void ivfpq_encode_core(uint32_t n_rows, uint32_t pq_dim, const uint32_t* label, uint8_t* output) +template +__device__ void ivfpq_encode_core(uint32_t n_rows, + uint32_t pq_dim, + const uint32_t* label, + uint8_t* output) { - bitfield_view_t out{output}; - for (uint32_t j = 0; j < pq_dim; j++, label += n_rows) { - out[j] = static_cast(*label); + constexpr uint32_t kChunkSize = (VecLen * 8u) / PqBits; + TxN_t vec; + for (uint32_t j = 0; j < pq_dim;) { + vec.fill(0); + bitfield_view_t out{vec.val.data}; +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++, label += n_rows) { + out[k] = static_cast(*label); + } + vec.store(output, 0); + output += VecLen; } } template -__launch_bounds__(BlockDim) __global__ - void ivfpq_encode_kernel(uint32_t n_rows, - uint32_t pq_dim, - const uint32_t* label, // [pq_dim, n_rows] - uint8_t* output // [n_rows, pq_dim] - ) +__launch_bounds__(BlockDim) __global__ void ivfpq_encode_kernel( + uint32_t pq_dim, + const uint32_t* label, // [pq_dim, n_rows] + device_mdspan output // [n_rows, pq_dim] +) { uint32_t i = threadIdx.x + BlockDim * blockIdx.x; - if (i >= n_rows) return; - ivfpq_encode_core(n_rows, pq_dim, label + i, output + (pq_dim * PqBits / 8) * i); + if (i >= output.extent(0)) return; + ivfpq_encode_core( + output.extent(0), + pq_dim, + label + i, + output.data_handle() + output.extent(1) * output.extent(2) * i); } } // namespace -inline void ivfpq_encode(uint32_t n_rows, - uint32_t pq_dim, +inline void ivfpq_encode(uint32_t pq_dim, uint32_t pq_bits, // 4 <= pq_bits <= 8 const uint32_t* label, // [pq_dim, n_rows] - uint8_t* output, // [n_rows, pq_dim] + device_mdspan output, // [n_rows, ..] rmm::cuda_stream_view stream) { constexpr uint32_t kBlockDim = 128; dim3 threads(kBlockDim, 1, 1); - dim3 blocks(raft::ceildiv(n_rows, kBlockDim), 1, 1); + dim3 blocks(raft::ceildiv(output.extent(0), kBlockDim), 1, 1); switch (pq_bits) { case 4: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 5: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 6: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 7: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 8: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); } } @@ -284,8 +300,8 @@ void select_residuals(const handle_t& handle, * it should be partitioned by the clusters by now. * @param cluster_sizes // [n_clusters] * @param cluster_offsets // [n_clusters + 1] - * @param pq_centers // [...] - * @param pq_dataset // [n_rows, pq_dim * pq_bits / 8] + * @param pq_centers // [...] + * @param pq_dataset // [n_rows, ...] 
* @param device_memory */ template @@ -307,7 +323,7 @@ void compute_pq_codes( const uint32_t* cluster_sizes, const IdxT* cluster_offsets, device_mdspan::pq_centers_extents, row_major> pq_centers, - uint8_t* pq_dataset, + device_mdspan pq_dataset, rmm::mr::device_memory_resource* device_memory) { common::nvtx::range fun_scope( @@ -333,10 +349,6 @@ void compute_pq_codes( size_t(max_cluster_size) * size_t(pq_dim * pq_len), stream, device_memory); rmm::device_uvector sub_vector_labels( size_t(max_cluster_size) * size_t(pq_dim), stream, device_memory); - rmm::device_uvector my_pq_dataset( - size_t(max_cluster_size) * size_t(pq_dim * pq_bits / 8) /* NB: pq_dim * bitPQ % 8 == 0 */, - stream, - device_memory); for (uint32_t l = 0; l < n_clusters; l++) { auto cluster_size = cluster_sizes[l]; @@ -416,11 +428,14 @@ void compute_pq_codes( // PQ encoding // ivfpq_encode( - cluster_size, pq_dim, pq_bits, sub_vector_labels.data(), my_pq_dataset.data(), stream); - copy(pq_dataset + size_t(cluster_offsets[l]) * size_t(pq_dim * pq_bits / 8), - my_pq_dataset.data(), - size_t(cluster_size) * size_t(pq_dim * pq_bits / 8), - stream); + pq_dim, + pq_bits, + sub_vector_labels.data(), + make_mdspan( + pq_dataset.data_handle() + + size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2), + make_extents(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))), + stream); } } @@ -751,8 +766,9 @@ inline auto extend(const handle_t& handle, // // Compute PQ code for new vectors // - rmm::device_uvector new_pq_codes( - size_t(n_rows) * size_t(orig_index.pq_dim() * orig_index.pq_bits() / 8), stream, device_memory); + pq_codes_exts new_pq_exts = make_extents( + n_rows, orig_index.pq_dataset().extent(1), orig_index.pq_dataset().static_extent(3)); + auto new_pq_codes = make_device_mdarray(handle, device_memory, new_pq_exts); compute_pq_codes(handle, n_rows, orig_index.dim(), @@ -770,7 +786,7 @@ inline auto extend(const handle_t& handle, new_cluster_sizes, new_cluster_offsets.data(), orig_index.pq_centers(), - new_pq_codes.data(), + new_pq_codes.view(), device_memory); // Get the combined cluster sizes and sort the clusters in decreasing order @@ -949,39 +965,45 @@ inline auto extend(const handle_t& handle, } /* Extend the pq_dataset */ - auto ext_pq_dataset = ext_index.pq_dataset().data_handle(); - auto pq_dataset_unit = - size_t(ext_index.pq_dataset().extent(1)) * size_t(ext_index.pq_dataset().extent(3)); - auto raw_data_unit = size_t(ext_index.pq_dim() * ext_index.pq_bits() / 8); + using vec_t = TxN_t::io_t; + + auto data_unit = ext_index.pq_dataset().extent(1); + auto ext_pq_dataset = + make_mdspan(reinterpret_cast(ext_index.pq_dataset().data_handle()), + make_extents( + ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); + for (uint32_t l = 0; l < ext_index.n_lists(); l++) { auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; auto old_pq_dataset = - make_mdspan(orig_index.pq_dataset().data_handle() + pq_dataset_unit * old_cluster_offsets[k], - ext_index.make_pq_dataset_extents(old_cluster_size)); - auto new_pq_data = - make_mdspan(new_pq_codes.data() + raw_data_unit * new_cluster_offsets.data()[k], - make_extents(new_cluster_sizes[k], raw_data_unit)); + make_mdspan(reinterpret_cast(orig_index.pq_dataset().data_handle()) + + data_unit * old_cluster_offsets[k], + make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), + data_unit, + ext_pq_dataset.extent(2))); + auto new_pq_data = 
make_mdspan(reinterpret_cast(new_pq_codes.data_handle()) + + data_unit * new_cluster_offsets.data()[k], + make_extents(new_cluster_sizes[k], data_unit)); linalg::writeOnlyUnaryOp( - ext_pq_dataset + pq_dataset_unit * ext_cluster_offsets[l], - pq_dataset_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), - [old_pq_dataset, new_pq_data, old_cluster_size] __device__(uint8_t * out, size_t i_flat) { - size_t i[4]; - for (int r = 3; r > 0; r--) { + ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l], + data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), + [old_pq_dataset, new_pq_data, old_cluster_size] __device__(vec_t * out, size_t i_flat) { + size_t i[3]; + for (int r = 2; r > 0; r--) { i[r] = i_flat % old_pq_dataset.extent(r); i_flat /= old_pq_dataset.extent(r); } i[0] = i_flat; auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2]; if (row_ix < old_cluster_size) { - *out = old_pq_dataset(i[0], i[1], i[2], i[3]); + *out = old_pq_dataset(i[0], i[1], i[2]); } else { row_ix -= old_cluster_size; - auto col_ix = i[1] * old_pq_dataset.extent(3) + i[3]; - if (row_ix < new_pq_data.extent(0) && col_ix < new_pq_data.extent(1)) { - *out = new_pq_data(row_ix, col_ix); + if (row_ix < new_pq_data.extent(0)) { + *out = new_pq_data(row_ix, i[1]); } else { - *out = 0; + *out = vec_t{}; } } }, diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index e3e54b40ec..b8f940101d 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -477,14 +477,15 @@ using block_sort_t = typename pq_block_sort::type; * Each block processes a (query, probe) pair: it calculates the distance between the single query * vector and all the dataset vector in the cluster that we are probing. * - * @tparam OpT is a carrier integer type selected to maximize throughput; - * Used solely in ivfpq-compute-score; * @tparam IdxT * The type of data indices * @tparam OutT * The output type - distances. * @tparam LutT * The lookup table element type (lut_scores). + * @tparam PqBits + * The bit length of an encoded vector element after compression by PQ + * (NB: pq_book_size = 1 << PqBits). * @tparam Capacity * Power-of-two; the maximum possible `k` in top-k. Value zero disables fused top-k search. * @tparam PrecompBaseDiff @@ -499,8 +500,6 @@ using block_sort_t = typename pq_block_sort::type; * @param n_rows the number of records in the dataset * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`). * @param n_probes the number of clusters to search for each query - * @param pq_bits the bit length of an encoded vector element after compression by PQ - * (NB: pq_book_size = 1 << pq_bits). * @param pq_dim * The dimensionality of an encoded vector after compression by PQ. * @param n_queries the number of queries. @@ -514,7 +513,7 @@ using block_sort_t = typename pq_block_sort::type; * The device pointer to the cluster centers in the PQ space * [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len,]. * @param pq_dataset - * The device pointer to the PQ index (data) [n_rows, pq_dim * pq_bits / 8]. + * The device pointer to the PQ index (data) [n_rows, ...]. * @param cluster_offsets * The device pointer to the cluster offsets [n_clusters + 1]. 
* @param cluster_labels @@ -527,7 +526,7 @@ using block_sort_t = typename pq_block_sort::type; * An optional device pointer to the enforced order of search [n_queries, n_probes]. * One can pass reordered indices here to try to improve data reading locality. * @param lut_scores - * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << pq_bits]. + * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits]. * Ignored when `EnableSMemLut == true`. * @param _out_scores * The device pointer to the output scores @@ -539,13 +538,13 @@ using block_sort_t = typename pq_block_sort::type; template __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, uint32_t dim, uint32_t n_probes, - uint32_t pq_bits, uint32_t pq_dim, uint32_t n_queries, distance::DistanceType metric, @@ -566,7 +565,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, { /* Shared memory: - * lut_scores: lookup table (LUT) of size = `pq_dim << pq_bits` (when EnableSMemLut) + * lut_scores: lookup table (LUT) of size = `pq_dim << PqBits` (when EnableSMemLut) * base_diff: size = dim (which is equal to `pq_dim * pq_len`) or dim*2 * topk::block_sort: some amount of shared memory, but overlaps with the rest: block_sort only needs shared memory for `.done()` operation, which can come very last. @@ -574,10 +573,11 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, extern __shared__ __align__(256) uint8_t smem_buf[]; // NOLINT constexpr bool kManageLocalTopK = Capacity > 0; + constexpr uint32_t PqShift = 1u << PqBits; // NOLINT + constexpr uint32_t PqMask = PqShift - 1u; // NOLINT + const uint32_t pq_len = dim / pq_dim; - const uint32_t pq_shift = 1u << pq_bits; - const uint32_t lut_size = pq_dim * pq_shift; - const uint32_t pq_mask = pq_shift - 1u; + const uint32_t lut_size = pq_dim * PqShift; if constexpr (EnableSMemLut) { lut_scores = reinterpret_cast(smem_buf); @@ -629,7 +629,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; } else { - pq_center = pq_centers + (pq_len << pq_bits) * label; + pq_center = pq_centers + (pq_len << PqBits) * label; } if constexpr (PrecompBaseDiff) { @@ -658,15 +658,15 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, // For each subspace, the lookup table stores the distance between the actual query vector // (projected into the subspace) and all possible pq vectors in that subspace. for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { - const uint32_t i_pq = i >> pq_bits; + const uint32_t i_pq = i >> PqBits; uint32_t j = i_pq * pq_len; const uint32_t j_end = pq_len + j; - auto cur_pq_center = pq_center + (i & pq_mask) + - (codebook_kind == codebook_gen::PER_SUBSPACE ? j * pq_shift : 0u); + auto cur_pq_center = pq_center + (i & PqMask) + + (codebook_kind == codebook_gen::PER_SUBSPACE ? 
j * PqShift : 0u); float score = 0.0; do { float pq_c = *cur_pq_center; - cur_pq_center += pq_shift; + cur_pq_center += PqShift; switch (metric) { case distance::DistanceType::L2Expanded: { float diff; @@ -709,7 +709,8 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; uint32_t n_samples_aligned = group_align::roundUp(n_samples); IdxT cluster_offset = cluster_offsets[label]; - uint32_t pq_line_width = vec_align::roundUp(pq_dim * pq_bits / 8); + const uint32_t pq_chunk = (kIndexGroupVecLen * 8u) / PqBits; + uint32_t pq_line_width = div_rounding_up_unsafe(pq_dim, pq_chunk) * vec_align::Value; auto pq_thread_data = pq_dataset + (size_t(cluster_offset) + group_align::roundDown(threadIdx.x)) * pq_line_width + group_align::mod(threadIdx.x) * vec_align::Value; @@ -730,36 +731,32 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, float score = 0; bool valid = i < n_samples; - { - auto pq_head = reinterpret_cast(pq_thread_data); - uint32_t bits_left = 0; - op_t pq_code = 0; - auto lut_head = lut_scores; - auto lut_end = lut_scores + lut_size; + if (valid) { + auto pq_head = reinterpret_cast(pq_thread_data); + auto lut_head = lut_scores; + auto lut_end = lut_scores + lut_size; vec_t pq_codes; do { *pq_codes.vectorized_data() = *pq_head; pq_head += kIndexGroupSize; + uint32_t bits_left = 0; + op_t pq_code = 0; #pragma unroll - for (uint32_t l = 0; l < vec_t::Ratio; l++) { - while (bits_left >= pq_bits && lut_head < lut_end) { - uint8_t code = pq_code & pq_mask; - pq_code >>= pq_bits; - score += valid ? float(lut_head[code]) : 0.0f; - lut_head += pq_shift; - bits_left -= pq_bits; - } - if (lut_head < lut_end) { - uint32_t rem_bits = pq_bits - bits_left; - uint8_t code = pq_code << rem_bits; - pq_code = pq_codes.val.data[l]; - code |= pq_code & ((1u << rem_bits) - 1u); - pq_code >>= rem_bits; - score += valid ? float(lut_head[code]) : 0.0f; - lut_head += pq_shift; - bits_left += 8 * sizeof(op_t) - pq_bits; - } else { - break; + for (uint32_t l = 0; l < vec_t::Ratio && lut_head < lut_end; l++) { + uint32_t rem_bits = PqBits - bits_left; + uint8_t code = pq_code << rem_bits; + pq_code = pq_codes.val.data[l]; + code |= pq_code & ((1u << rem_bits) - 1u); + pq_code >>= rem_bits; + score += float(lut_head[code]); + lut_head += PqShift; + bits_left += 8 * sizeof(op_t) - PqBits; + while (bits_left >= PqBits && lut_head < lut_end) { + code = pq_code & PqMask; + pq_code >>= PqBits; + score += float(lut_head[code]); + lut_head += PqShift; + bits_left -= PqBits; } } } while (lut_head < lut_end); @@ -873,30 +870,38 @@ constexpr inline auto estimate_carveout(double shmem_fraction, */ template struct ivfpq_compute_similarity { - using kernel_t = decltype(&ivfpq_compute_similarity_kernel); + using kernel_t = decltype(&ivfpq_compute_similarity_kernel); template struct configured { public: /** Select a proper kernel instance based on the runtime parameters. 
*/ - static auto kernel(uint32_t k_max) -> kernel_t + static auto kernel(uint32_t pq_bits, uint32_t k_max) -> kernel_t { - return kernel_try_capacity(k_max); + switch (pq_bits) { + case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max); + case 5: return kernel_try_capacity<5, kMaxCapacity>(k_max); + case 6: return kernel_try_capacity<6, kMaxCapacity>(k_max); + case 7: return kernel_try_capacity<7, kMaxCapacity>(k_max); + case 8: return kernel_try_capacity<8, kMaxCapacity>(k_max); + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } } private: - template + template static auto kernel_try_capacity(uint32_t k_max) -> kernel_t { if constexpr (Capacity > 0) { - if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity<0>(k_max); } + if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } } if constexpr (Capacity > 1) { - if (k_max * 2 <= Capacity) { return kernel_try_capacity<(Capacity / 2)>(k_max); } + if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } } return ivfpq_compute_similarity_kernel; @@ -1020,9 +1025,10 @@ struct ivfpq_compute_similarity { using conf_no_basediff = configured; using conf_no_smem_lut = configured; auto topk_or_zero = manage_local_topk ? topk : 0u; - std::array candidates{std::make_tuple(conf_fast::kernel(topk_or_zero), lut_mem + bdf_mem, true), - std::make_tuple(conf_no_basediff::kernel(topk_or_zero), lut_mem, true), - std::make_tuple(conf_no_smem_lut::kernel(topk_or_zero), bdf_mem, false)}; + std::array candidates{ + std::make_tuple(conf_fast::kernel(pq_bits, topk_or_zero), lut_mem + bdf_mem, true), + std::make_tuple(conf_no_basediff::kernel(pq_bits, topk_or_zero), lut_mem, true), + std::make_tuple(conf_no_smem_lut::kernel(pq_bits, topk_or_zero), bdf_mem, false)}; // allocation unit size (for rounding up) using shmem_unit = Pow2<128>; @@ -1318,7 +1324,6 @@ void ivfpq_search_worker(const handle_t& handle, index.size(), index.rot_dim(), n_probes, - index.pq_bits(), index.pq_dim(), n_queries, index.metric(), From c0e1c80b2dd8710a85bc90208bbfb43479e31193 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 9 Nov 2022 16:08:52 +0100 Subject: [PATCH 16/31] Manually unroll the compute-score loop with templates (+8%-+16% qps) --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 102 ++++++++++++------ 1 file changed, 72 insertions(+), 30 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index b8f940101d..19721ea952 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -469,6 +469,76 @@ struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t { template using block_sort_t = typename pq_block_sort::type; +/* Manually unrolled loop over a chunk of pq_dataset that fits into one VecT. 
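   The runtime `bits_left` bookkeeping of the previous version is replaced by the `BitsLeft` /
   `Ix` template parameters, so every shift and mask is a compile-time constant and the
   per-chunk loop is fully unrolled. For instance, with PqBits = 8 and a 32-bit carrier word,
   each word contributes exactly four lookups and no code straddles a word boundary; for
   PqBits values that do not divide the word width (5, 6, 7), the straddling case is handled
   by the `Ix < VecT::Ratio` branch.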
*/ +template +__device__ __forceinline__ void ivfpq_compute_chunk(float& score /* NOLINT */, + typename VecT::math_t& pq_code, + const VecT& pq_codes, + const LutT*& lut_head, + const LutT*& lut_end) +{ + if constexpr (CheckBounds) { + if (lut_head >= lut_end) { return; } + } + constexpr uint32_t kTotalBits = 8 * sizeof(typename VecT::math_t); + constexpr uint32_t kPqShift = 1u << PqBits; + constexpr uint32_t kPqMask = kPqShift - 1u; + if constexpr (BitsLeft >= PqBits) { + uint8_t code = pq_code & kPqMask; + pq_code >>= PqBits; + score += float(lut_head[code]); + lut_head += kPqShift; + return ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); + } else if constexpr (Ix < VecT::Ratio) { + constexpr uint32_t kRemBits = PqBits - BitsLeft; + constexpr uint32_t kRemMask = (1u << kRemBits) - 1u; + uint8_t code = pq_code << kRemBits; + pq_code = pq_codes.val.data[Ix]; + code |= pq_code & kRemMask; + pq_code >>= kRemBits; + score += float(lut_head[code]); + lut_head += kPqShift; + return ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + } +} + +/* Compute the similarity for one vector in the pq_dataset */ +template +__device__ auto ivfpq_compute_score(uint32_t pq_dim, + const typename VecT::io_t* pq_head, + const LutT* lut_scores) -> float +{ + constexpr uint32_t kChunks = sizeof(VecT) * 8 / PqBits; + auto lut_head = lut_scores; + auto lut_end = lut_scores + (pq_dim << PqBits); + VecT pq_codes; + float score = 0; + uint32_t dims_left = pq_dim; + for (; dims_left >= kChunks; dims_left -= kChunks) { + *pq_codes.vectorized_data() = *pq_head; + pq_head += kIndexGroupSize; + typename VecT::math_t pq_code = 0; + ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + } + if (dims_left > 0) { + *pq_codes.vectorized_data() = *pq_head; + typename VecT::math_t pq_code = 0; + ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + } + return score; +} + /** * The main kernel that computes similarity scores across multiple queries and probes. 
* When `Capacity > 0`, it also selects top K candidates for each query and probe @@ -730,38 +800,10 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, i += blockDim.x, pq_thread_data += pq_line_width) { float score = 0; bool valid = i < n_samples; - if (valid) { - auto pq_head = reinterpret_cast(pq_thread_data); - auto lut_head = lut_scores; - auto lut_end = lut_scores + lut_size; - vec_t pq_codes; - do { - *pq_codes.vectorized_data() = *pq_head; - pq_head += kIndexGroupSize; - uint32_t bits_left = 0; - op_t pq_code = 0; -#pragma unroll - for (uint32_t l = 0; l < vec_t::Ratio && lut_head < lut_end; l++) { - uint32_t rem_bits = PqBits - bits_left; - uint8_t code = pq_code << rem_bits; - pq_code = pq_codes.val.data[l]; - code |= pq_code & ((1u << rem_bits) - 1u); - pq_code >>= rem_bits; - score += float(lut_head[code]); - lut_head += PqShift; - bits_left += 8 * sizeof(op_t) - PqBits; - while (bits_left >= PqBits && lut_head < lut_end) { - code = pq_code & PqMask; - pq_code >>= PqBits; - score += float(lut_head[code]); - lut_head += PqShift; - bits_left -= PqBits; - } - } - } while (lut_head < lut_end); + score = ivfpq_compute_score( + pq_dim, reinterpret_cast(pq_thread_data), lut_scores); } - if (!valid || score >= float(kDummy)) { score = float(kDummy); } if constexpr (kManageLocalTopK) { block_topk.add(OutT(score), cluster_offset + i); From 5e39679d174a067572276ca4488f4d061822658a Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 9 Nov 2022 17:37:32 +0100 Subject: [PATCH 17/31] Use OutT instead of float as the compute-score output type to save on extra conversions when possible --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 19721ea952..b4df54bd36 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -86,6 +86,7 @@ struct fp_8bit { return *this; } HDI explicit operator float() const { return fp_8bit2float(*this); } + HDI explicit operator half() const { return half(fp_8bit2float(*this)); } private: static constexpr float kMin = 1.0f / float(1u << ExpMask); @@ -470,13 +471,14 @@ template using block_sort_t = typename pq_block_sort::type; /* Manually unrolled loop over a chunk of pq_dataset that fits into one VecT. 
*/ -template -__device__ __forceinline__ void ivfpq_compute_chunk(float& score /* NOLINT */, +__device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */, typename VecT::math_t& pq_code, const VecT& pq_codes, const LutT*& lut_head, @@ -491,9 +493,9 @@ __device__ __forceinline__ void ivfpq_compute_chunk(float& score /* NOLINT */, if constexpr (BitsLeft >= PqBits) { uint8_t code = pq_code & kPqMask; pq_code >>= PqBits; - score += float(lut_head[code]); + score += OutT(lut_head[code]); lut_head += kPqShift; - return ivfpq_compute_chunk( + return ivfpq_compute_chunk( score, pq_code, pq_codes, lut_head, lut_end); } else if constexpr (Ix < VecT::Ratio) { constexpr uint32_t kRemBits = PqBits - BitsLeft; @@ -502,9 +504,10 @@ __device__ __forceinline__ void ivfpq_compute_chunk(float& score /* NOLINT */, pq_code = pq_codes.val.data[Ix]; code |= pq_code & kRemMask; pq_code >>= kRemBits; - score += float(lut_head[code]); + score += OutT(lut_head[code]); lut_head += kPqShift; - return ivfpq_compute_chunk +template __device__ auto ivfpq_compute_score(uint32_t pq_dim, const typename VecT::io_t* pq_head, - const LutT* lut_scores) -> float + const LutT* lut_scores) -> OutT { constexpr uint32_t kChunks = sizeof(VecT) * 8 / PqBits; auto lut_head = lut_scores; auto lut_end = lut_scores + (pq_dim << PqBits); VecT pq_codes; - float score = 0; + OutT score{0}; uint32_t dims_left = pq_dim; for (; dims_left >= kChunks; dims_left -= kChunks) { *pq_codes.vectorized_data() = *pq_head; pq_head += kIndexGroupSize; typename VecT::math_t pq_code = 0; - ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); } if (dims_left > 0) { *pq_codes.vectorized_data() = *pq_head; typename VecT::math_t pq_code = 0; - ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); } return score; } @@ -798,17 +803,16 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, // Compute a distance for each sample for (uint32_t i = threadIdx.x; i < n_samples_aligned; i += blockDim.x, pq_thread_data += pq_line_width) { - float score = 0; - bool valid = i < n_samples; + OutT score = kDummy; + bool valid = i < n_samples; if (valid) { - score = ivfpq_compute_score( + score = ivfpq_compute_score( pq_dim, reinterpret_cast(pq_thread_data), lut_scores); } - if (!valid || score >= float(kDummy)) { score = float(kDummy); } if constexpr (kManageLocalTopK) { - block_topk.add(OutT(score), cluster_offset + i); + block_topk.add(score, cluster_offset + i); } else { - if (valid) { out_scores[i + sample_offset] = OutT(score); } + if (valid) { out_scores[i + sample_offset] = score; } } } if constexpr (kManageLocalTopK) { From 781433942e635b5f4af539113241e400642b8f3c Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 9 Nov 2022 18:00:59 +0100 Subject: [PATCH 18/31] set default shmem carveout to 1.0 to allow smaller block sizes --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index bdac4cff75..61eba29638 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -117,12 +117,11 @@ struct search_params : ann::search_params { * value is interpreted only as a hint. 
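 * (For instance, a hypothetical `search_params sp; sp.preferred_shmem_carveout = 0.5;` asks the driver for roughly an even shmem/L1 split.)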
Moreover, a GPU usually allows only a fixed set of cache * configurations, so the provided value is rounded up to the nearest configuration. Refer to the * NVIDIA tuning guide for the target GPU architecture. - * The default value of `0.64` seems to be a good compromise for the majority of the current GPUs. * * Note, this is a low-level tuning parameter that can have drastic negative effects on the search * performance if tweaked incorrectly. */ - double preferred_shmem_carveout = 0.64; + double preferred_shmem_carveout = 1.0; }; static_assert(std::is_aggregate_v); From fe3399ca984f6f38111bffc6128b8114678265da Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 10 Nov 2022 08:37:51 +0100 Subject: [PATCH 19/31] Fix build error due to merged code --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index eaf41224f2..86b2d97d79 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -432,7 +432,7 @@ void compute_pq_codes( pq_dim, pq_bits, sub_vector_labels.data(), - make_mdspan( + make_mdspan( pq_dataset.data_handle() + size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2), make_extents(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))), @@ -960,24 +960,25 @@ inline auto extend(const handle_t& handle, /* Extend the pq_dataset */ using vec_t = TxN_t::io_t; - auto data_unit = ext_index.pq_dataset().extent(1); - auto ext_pq_dataset = - make_mdspan(reinterpret_cast(ext_index.pq_dataset().data_handle()), - make_extents( - ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); + auto data_unit = ext_index.pq_dataset().extent(1); + auto ext_pq_dataset = make_mdspan( + reinterpret_cast(ext_index.pq_dataset().data_handle()), + make_extents( + ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); for (uint32_t l = 0; l < ext_index.n_lists(); l++) { auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; - auto old_pq_dataset = - make_mdspan(reinterpret_cast(orig_index.pq_dataset().data_handle()) + - data_unit * old_cluster_offsets[k], - make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), - data_unit, - ext_pq_dataset.extent(2))); - auto new_pq_data = make_mdspan(reinterpret_cast(new_pq_codes.data_handle()) + - data_unit * new_cluster_offsets.data()[k], - make_extents(new_cluster_sizes[k], data_unit)); + auto old_pq_dataset = make_mdspan( + reinterpret_cast(orig_index.pq_dataset().data_handle()) + + data_unit * old_cluster_offsets[k], + make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), + data_unit, + ext_pq_dataset.extent(2))); + auto new_pq_data = make_mdspan( + reinterpret_cast(new_pq_codes.data_handle()) + + data_unit * new_cluster_offsets.data()[k], + make_extents(new_cluster_sizes[k], data_unit)); linalg::writeOnlyUnaryOp( ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l], data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), From acb1bf2d3221c041edf42ccb45f57f00ec814e55 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 10 Nov 2022 09:52:10 +0100 Subject: [PATCH 20/31] Add the early stop optimization for metrics that allow this (i.e. 
for now, it's L2) --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index b4df54bd36..8f365cdda8 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -520,7 +520,8 @@ __device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */, template __device__ auto ivfpq_compute_score(uint32_t pq_dim, const typename VecT::io_t* pq_head, - const LutT* lut_scores) -> OutT + const LutT* lut_scores, + OutT early_stop_limit) -> OutT { constexpr uint32_t kChunks = sizeof(VecT) * 8 / PqBits; auto lut_head = lut_scores; @@ -534,6 +535,10 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, typename VecT::math_t pq_code = 0; ivfpq_compute_chunk( score, pq_code, pq_codes, lut_head, lut_end); + // Early stop when it makes sense (otherwise early_stop_limit is kDummy/infinity). + if constexpr (kChunks > 1) { + if (score >= early_stop_limit) { return score; } + } } if (dims_left > 0) { *pq_codes.vectorized_data() = *pq_head; @@ -795,6 +800,15 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, OutT query_kth = kDummy; if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } local_topk_t block_topk(topk, smem_buf, query_kth); + OutT early_stop_limit = kDummy; + switch (metric) { + // If the metric is non-negative, we can use the query_kth approximation as an early stop + // threshold to skip some iterations when computing the score. Add such metrics here. + case distance::DistanceType::L2Expanded: { + early_stop_limit = query_kth; + } break; + default: break; + } // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score __threadfence_block(); @@ -807,7 +821,10 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, bool valid = i < n_samples; if (valid) { score = ivfpq_compute_score( - pq_dim, reinterpret_cast(pq_thread_data), lut_scores); + pq_dim, + reinterpret_cast(pq_thread_data), + lut_scores, + early_stop_limit); } if constexpr (kManageLocalTopK) { block_topk.add(score, cluster_offset + i); From 5b51b3eba55d275d6dcc17a08d03f6e560afea87 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 10 Nov 2022 18:03:21 +0100 Subject: [PATCH 21/31] Address the first review comments --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 3 +- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 8 ++--- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 34 +++++++------------ .../spatial/knn/detail/topk/warpsort_topk.cuh | 6 ++-- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 61eba29638..d09bcf4f94 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -321,7 +321,8 @@ struct index : ann::index { using pq_dataset_extents = std::experimental:: extents; - /** PQ-encoded data + /** PQ-encoded data stored in the interleaved format: + * * [ ceildiv(size, kIndexGroupSize) * , ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits) * , kIndexGroupSize diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 86b2d97d79..08d0966162 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ 
-432,7 +432,7 @@ void compute_pq_codes( pq_dim, pq_bits, sub_vector_labels.data(), - make_mdspan( + make_mdspan( pq_dataset.data_handle() + size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2), make_extents(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))), @@ -961,7 +961,7 @@ inline auto extend(const handle_t& handle, using vec_t = TxN_t::io_t; auto data_unit = ext_index.pq_dataset().extent(1); - auto ext_pq_dataset = make_mdspan( + auto ext_pq_dataset = make_mdspan( reinterpret_cast(ext_index.pq_dataset().data_handle()), make_extents( ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); @@ -969,13 +969,13 @@ inline auto extend(const handle_t& handle, for (uint32_t l = 0; l < ext_index.n_lists(); l++) { auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; - auto old_pq_dataset = make_mdspan( + auto old_pq_dataset = make_mdspan( reinterpret_cast(orig_index.pq_dataset().data_handle()) + data_unit * old_cluster_offsets[k], make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), data_unit, ext_pq_dataset.extent(2))); - auto new_pq_data = make_mdspan( + auto new_pq_data = make_mdspan( reinterpret_cast(new_pq_codes.data_handle()) + data_unit * new_cluster_offsets.data()[k], make_extents(new_cluster_sizes[k], data_unit)); diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 8f365cdda8..8fca5cf36a 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -873,8 +873,8 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters, m = n_queries r = # of times a specific block appears in the batched sample. - Then: - P(r) = C(n,k) * k^r * (n - k)^r / n^m + Then, r has the Binomial distribution (p = k / n): + P(r) = C(m,r) * k^r * (n - k)^(m - r) / n^m E[r] = m * k / n E[r | r > 0] = m * k / n / (1 - (1 - k/n)^m) @@ -885,9 +885,15 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters, } /** - * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout`, + * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout` + * (which does not take into account `reservedSharedMemPerBlock`), * given by a desired shmem-L1 split and a per-block memory requirement in bytes. + * + * NB: As per the programming guide, the memory carveout setting is just a hint for the driver; it's + * free to choose any shmem-L1 configuration it deems appropriate. For example, if you set the + * carveout to zero, it will choose a non-zero config that will allow running at least one active + * block per SM. + * * @param shmem_fraction * a fraction representing a desired split (shmem / (shmem + L1)) [0, 1]. * @param shmem_per_block @@ -902,25 +908,9 @@ constexpr inline auto estimate_carveout(double shmem_fraction, const cudaDeviceProp& dev_props) -> int { using shmem_unit = Pow2<128>; - /* - Memory carveout setting is just a hint for the driver; it's free to choose any shmem-L1 - configuration it deems appropriate. Nevertheless, we can guide it a little to choose a more - optimal setting in non-edge cases. Entirely undocumented, the driver seems to perform the - following steps to calculate the final split: - 1. Calculate the hinted memory usage: - mem_use_hint = (carveout_hint * sharedMemPerMultiprocessor / 100) - 2.
Get the maximum occupancy (possibly including other kernel limits): - blocks_per_sm = min(other_block_limits, mem_use_hint / shmem_unit::roundUp(shmem_per_block)). - 3. Add up the hinted memory and the system per-block memory: - requested_mem = mem_use_hint + blocks_per_sm * reservedSharedMemPerBlock - 4. Round up to the nearest possible config - (e.g. one of 0,8,16,32,64,100 KB for compute capability 8.6). - - The algorithm below performs the reverse of steps 1-3. - */ - size_t m = shmem_unit::roundUp(shmem_per_block); - size_t r = dev_props.reservedSharedMemPerBlock; - size_t s = dev_props.sharedMemPerMultiprocessor; + size_t m = shmem_unit::roundUp(shmem_per_block); + size_t r = dev_props.reservedSharedMemPerBlock; + size_t s = dev_props.sharedMemPerMultiprocessor; return (size_t(100 * s * m * shmem_fraction) - (m - 1) * r) / (s * (m + r)); } diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index fe1a7611b9..fa6cc5c6dd 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -33,7 +33,7 @@ Three APIs of different scopes are provided: 1. host function: warp_sort_topk() 2. block-wide API: class block_sort - 3. warp-wide API: class warp_sort_filtered and class warp_sort_immediate + 3. warp-wide API: several implementations of warp_sort_* 1. warp_sort_topk() @@ -42,7 +42,7 @@ 2. class block_sort It can be regarded as a fixed size priority queue for a thread block, although the API is not typical. - class warp_sort_filtered and warp_sort_immediate can be used to instantiate block_sort. + one of the classes `warp_sort_*` can be used to instantiate block_sort. It uses dynamic shared memory as an intermediate buffer. So the required shared memory size should be calculated using @@ -70,7 +70,7 @@ kernel<<>>(); - 3. class warp_sort_filtered and class warp_sort_immediate + 3. class warp_sort_* These two classes can be regarded as fixed size priority queue for a warp. Usage is similar to class block_sort. No shared memory is needed. From a78b52bede3207cae8cb486a111720d0241d4283 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 11 Nov 2022 15:57:42 +0100 Subject: [PATCH 22/31] Address more review comments --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 95 +++++++++++-------- 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 8fca5cf36a..48f2a60f73 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -778,6 +778,16 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, } } + // Define helper types for efficient access to the pq_dataset, which is stored in an interleaved + // format. The chunks of PQ data are stored in kIndexGroupVecLen-bytes-long chunks, interleaved + // in groups of kIndexGroupSize elems (which is normally equal to the warp size) for the fastest + // possible access by thread warps. + // + // Consider one record in the pq_dataset is `pq_dim * pq_bits`-bit-long. + // Assuming `kIndexGroupVecLen = 16`, one chunk of data read by a thread at once is 128-bits. + // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record + // consists of `ceildiv(pq_dim, chunk_size)` chunks. 
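+ // (A worked example with hypothetical sizes: pq_bits = 8 and pq_dim = 64 give chunk_size = 128 / 8 = 16 elements per 16-byte chunk, so each record occupies ceildiv(64, 16) = 4 chunks.)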
The chunks are interleaved in groups of 32, + // so that the warp can achieve the best coalesced read throughput. using group_align = Pow2; using vec_align = Pow2; using local_topk_t = block_sort_t; @@ -961,6 +971,28 @@ struct ivfpq_compute_similarity { } }; + /** Estimate the occupancy for the given kernel on the given device. */ + struct occupancy_t { + using shmem_unit = Pow2<128>; + + int blocks_per_sm = 0; + double occupancy = 0.0; + double shmem_use = 1.0; + + inline occupancy_t() = default; + inline occupancy_t(size_t smem, + uint32_t n_threads, + kernel_t kernel, + const cudaDeviceProp& dev_props) + { + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel, n_threads, smem)); + occupancy = double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor); + shmem_use = double(shmem_unit::roundUp(smem) * blocks_per_sm) / + double(dev_props.sharedMemPerMultiprocessor); + } + }; + struct selected { kernel_t kernel; dim3 grid_dim; @@ -1083,13 +1115,10 @@ struct ivfpq_compute_similarity { std::make_tuple(conf_no_basediff::kernel(pq_bits, topk_or_zero), lut_mem, true), std::make_tuple(conf_no_smem_lut::kernel(pq_bits, topk_or_zero), bdf_mem, false)}; - // allocation unit size (for rounding up) - using shmem_unit = Pow2<128>; // we may allow slightly lower than 100% occupancy; constexpr double kTargetOccupancy = 0.75; - // these two parameters are used to select the better candidate - double selected_occupancy = 0.0; - double selected_shmem_use = 1.0; + // This struct is used to select the better candidate + occupancy_t selected_perf{}; selected selected_config; for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) { if (smem_size_const > dev_props.sharedMemPerBlockOptin) { @@ -1127,19 +1156,12 @@ struct ivfpq_compute_similarity { continue; } - int blocks_per_sm; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, kernel, n_threads, smem_size)); - if (blocks_per_sm <= 0) { + occupancy_t cur(smem_size, n_threads, kernel, dev_props); + if (cur.blocks_per_sm <= 0) { // For some reason, we still cannot make this kernel run. Skip the candidate. continue; } - auto occupancy = - double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor); - auto shmem_use = double(shmem_unit::roundUp(smem_size) * blocks_per_sm) / - double(dev_props.sharedMemPerMultiprocessor); - { // Try to reduce the number of threads to increase occupancy and data locality auto n_threads_tmp = n_threads_min; @@ -1148,36 +1170,28 @@ struct ivfpq_compute_similarity { } if (n_threads_tmp < n_threads) { while (n_threads_tmp >= n_threads_min) { - int blocks_per_sm_tmp; auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp)); - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm_tmp, kernel, n_threads_tmp, smem_size_tmp)); - auto occupancy_tmp = double(blocks_per_sm_tmp * n_threads_tmp) / - double(dev_props.maxThreadsPerMultiProcessor); - auto shmem_use_tmp = double(shmem_unit::roundUp(smem_size) * blocks_per_sm_tmp) / - double(dev_props.sharedMemPerMultiprocessor); + occupancy_t tmp(smem_size_tmp, n_threads_tmp, kernel, dev_props); bool select_it = false; - if (lut_is_in_shmem && locality_hint >= blocks_per_sm_tmp) { + if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) { // Normally, the smaller the block the better for L1 cache hit rate. 
// Hence, the occupancy should be "just good enough" - select_it = occupancy_tmp >= min(kTargetOccupancy, occupancy); + select_it = tmp.occupancy >= min(kTargetOccupancy, cur.occupancy); } else if (lut_is_in_shmem) { - // If we don't have enough repeating probes (locality_hint < blocks_per_sm_tmp), + // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm), // the locality is not going to improve with increasing the number of blocks per SM. // Hence, the only metric here is the occupancy. - select_it = occupancy_tmp > occupancy; + select_it = tmp.occupancy > cur.occupancy; } else { // If we don't use shared memory for the lookup table, increasing the number of blocks // is very taxing on the global memory usage. // In this case, the occupancy must increase a lot to make it worth the cost. - select_it = occupancy_tmp >= min(1.0, occupancy / kTargetOccupancy); + select_it = tmp.occupancy >= min(1.0, cur.occupancy / kTargetOccupancy); } if (select_it) { - n_threads = n_threads_tmp; - smem_size = smem_size_tmp; - blocks_per_sm = blocks_per_sm_tmp; - occupancy = occupancy_tmp; - shmem_use = shmem_use_tmp; + n_threads = n_threads_tmp; + smem_size = smem_size_tmp; + cur = tmp; } n_threads_tmp /= 2; } @@ -1185,12 +1199,11 @@ struct ivfpq_compute_similarity { } { - if (selected_occupancy <= 0.0 // no candidate yet - || (selected_occupancy < occupancy * kTargetOccupancy && - selected_shmem_use >= shmem_use) // much improved occupancy + if (selected_perf.occupancy <= 0.0 // no candidate yet + || (selected_perf.occupancy < cur.occupancy * kTargetOccupancy && + selected_perf.shmem_use >= cur.shmem_use) // much improved occupancy ) { - selected_occupancy = occupancy; - selected_shmem_use = shmem_use; + selected_perf = cur; if (lut_is_in_shmem) { selected_config = { kernel, dim3(n_blocks, 1, 1), dim3(n_threads, 1, 1), smem_size, size_t(0)}; @@ -1198,7 +1211,7 @@ struct ivfpq_compute_similarity { // When the global memory is used for the lookup table, we need to minimize the grid // size; otherwise, the kernel may quickly run out of memory. auto n_blocks_min = - std::min(n_blocks, blocks_per_sm * dev_props.multiProcessorCount); + std::min(n_blocks, cur.blocks_per_sm * dev_props.multiProcessorCount); selected_config = {kernel, dim3(n_blocks_min, 1, 1), dim3(n_threads, 1, 1), @@ -1208,11 +1221,11 @@ struct ivfpq_compute_similarity { // Actual shmem/L1 split wildly rounds up the specified preferred carveout, so we set here // a rather conservative bar; most likely, the kernel gets more shared memory than this, // and the occupancy doesn't get hurt. - auto carveout = std::min(max_carveout, std::ceil(100.0 * selected_shmem_use)); + auto carveout = std::min(max_carveout, std::ceil(100.0 * cur.shmem_use)); RAFT_CUDA_TRY( cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); - if (occupancy >= kTargetOccupancy) { break; } - } else if (selected_occupancy > 0.0) { + if (cur.occupancy >= kTargetOccupancy) { break; } + } else if (selected_perf.occupancy > 0.0) { // If we found a reasonable candidate on a previous iteration, and this one is not better, // then don't try any more candidates because they are much slower anyway. 
break; @@ -1220,7 +1233,7 @@ struct ivfpq_compute_similarity { } } - RAFT_EXPECTS(selected_occupancy > 0.0, + RAFT_EXPECTS(selected_perf.occupancy > 0.0, "Couldn't determine a working kernel launch configuration."); return selected_config; From f8a056884768389d7f7961dc45576f9e21dbccb1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 11 Nov 2022 16:31:11 +0100 Subject: [PATCH 23/31] Address one more set of comments --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 08d0966162..efb4eee2a9 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -155,6 +155,10 @@ __launch_bounds__(BlockDim) __global__ void ivfpq_encode_kernel( } } // namespace +/** + * Compress the cluster labels into an encoding with pq_bits bits, and transform it into a form to + * facilitate vectorized loads + */ inline void ivfpq_encode(uint32_t pq_dim, uint32_t pq_bits, // 4 <= pq_bits <= 8 const uint32_t* label, // [pq_dim, n_rows] @@ -283,6 +287,7 @@ void select_residuals(const handle_t& handle, } /** + * * @param handle, * @param n_rows * @param data_dim @@ -301,8 +306,12 @@ void select_residuals(const handle_t& handle, * it should be partitioned by the clusters by now. * @param cluster_sizes // [n_clusters] * @param cluster_offsets // [n_clusters + 1] - * @param pq_centers // [...] - * @param pq_dataset // [n_rows, ...] + * @param pq_centers // [...] (see ivf_pq::index::pq_centers() layout) + * @param pq_dataset + * // [n_rows, ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits), kIndexGroupVecLen] + * NB: in contrast to the final interleaved layout in ivf_pq::index::pq_dataset(), this function + * produces a non-interleaved data; it gets interleaved later when adding the data to the + * index. * @param device_memory */ template @@ -958,6 +967,8 @@ inline auto extend(const handle_t& handle, } /* Extend the pq_dataset */ + // For simplicity and performance, we reinterpret the last dimension of the dataset + // as a single vector element. using vec_t = TxN_t::io_t; auto data_unit = ext_index.pq_dataset().extent(1); @@ -967,6 +978,9 @@ inline auto extend(const handle_t& handle, ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); for (uint32_t l = 0; l < ext_index.n_lists(); l++) { + // Extend the data cluster-by-cluster; + // The original/old index stores the data interleaved; + // the new data produced by `compute_pq_codes` is not interleaved. 
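+ // (A sketch of the resulting layout, assuming kIndexGroupSize equals the warp size of 32: within the extended cluster, the first old_cluster_size row slots are copied from the old, already-interleaved groups, the following new_cluster_sizes[k] slots are filled from the flat new codes, and any leftover slots in the last 32-wide group are just alignment padding.)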
auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; auto old_pq_dataset = make_mdspan( @@ -979,10 +993,12 @@ inline auto extend(const handle_t& handle, reinterpret_cast(new_pq_codes.data_handle()) + data_unit * new_cluster_offsets.data()[k], make_extents(new_cluster_sizes[k], data_unit)); + // Write all cluster data, vec-by-vec linalg::writeOnlyUnaryOp( ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l], data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), [old_pq_dataset, new_pq_data, old_cluster_size] __device__(vec_t * out, size_t i_flat) { + // find the proper 3D index from the flat offset size_t i[3]; for (int r = 2; r > 0; r--) { i[r] = i_flat % old_pq_dataset.extent(r); @@ -991,8 +1007,10 @@ inline auto extend(const handle_t& handle, i[0] = i_flat; auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2]; if (row_ix < old_cluster_size) { + // First, pack the original/old data *out = old_pq_dataset(i[0], i[1], i[2]); } else { + // Then add the new data row_ix -= old_cluster_size; if (row_ix < new_pq_data.extent(0)) { *out = new_pq_data(row_ix, i[1]); From 598a84eca3d90e42bbb5b05c7b05e9e54c4b72bc Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Nov 2022 10:41:26 +0100 Subject: [PATCH 24/31] Cosmetic improvements --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 10 +++---- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 27 +++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index efb4eee2a9..4e25cc2622 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -139,11 +139,11 @@ __device__ void ivfpq_encode_core(uint32_t n_rows, } template -__launch_bounds__(BlockDim) __global__ void ivfpq_encode_kernel( - uint32_t pq_dim, - const uint32_t* label, // [pq_dim, n_rows] - device_mdspan output // [n_rows, pq_dim] -) +__launch_bounds__(BlockDim) __global__ + void ivfpq_encode_kernel(uint32_t pq_dim, + const uint32_t* label, // [pq_dim, n_rows] + device_mdspan output // [n_rows, ..] 
+ ) { uint32_t i = threadIdx.x + BlockDim * blockIdx.x; if (i >= output.extent(0)) return; diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 48f2a60f73..e154137818 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -511,7 +511,7 @@ __device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */, VecT, CheckBounds, PqBits, - kTotalBits + BitsLeft - PqBits, + kTotalBits - kRemBits, Ix + 1>(score, pq_code, pq_codes, lut_head, lut_end); } } @@ -523,24 +523,21 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, const LutT* lut_scores, OutT early_stop_limit) -> OutT { - constexpr uint32_t kChunks = sizeof(VecT) * 8 / PqBits; - auto lut_head = lut_scores; - auto lut_end = lut_scores + (pq_dim << PqBits); + constexpr uint32_t kChunkSize = sizeof(VecT) * 8u / PqBits; + auto lut_head = lut_scores; + auto lut_end = lut_scores + (pq_dim << PqBits); VecT pq_codes; OutT score{0}; - uint32_t dims_left = pq_dim; - for (; dims_left >= kChunks; dims_left -= kChunks) { + for (; pq_dim >= kChunkSize; pq_dim -= kChunkSize) { *pq_codes.vectorized_data() = *pq_head; pq_head += kIndexGroupSize; typename VecT::math_t pq_code = 0; ivfpq_compute_chunk( score, pq_code, pq_codes, lut_head, lut_end); // Early stop when it makes sense (otherwise early_stop_limit is kDummy/infinity). - if constexpr (kChunks > 1) { - if (score >= early_stop_limit) { return score; } - } + if (score >= early_stop_limit) { return score; } } - if (dims_left > 0) { + if (pq_dim > 0) { *pq_codes.vectorized_data() = *pq_head; typename VecT::math_t pq_code = 0; ivfpq_compute_chunk( @@ -796,11 +793,11 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, uint32_t sample_offset = 0; if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } - uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; - uint32_t n_samples_aligned = group_align::roundUp(n_samples); - IdxT cluster_offset = cluster_offsets[label]; - const uint32_t pq_chunk = (kIndexGroupVecLen * 8u) / PqBits; - uint32_t pq_line_width = div_rounding_up_unsafe(pq_dim, pq_chunk) * vec_align::Value; + uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; + uint32_t n_samples_aligned = group_align::roundUp(n_samples); + IdxT cluster_offset = cluster_offsets[label]; + constexpr uint32_t kChunkSize = (kIndexGroupVecLen * 8u) / PqBits; + uint32_t pq_line_width = div_rounding_up_unsafe(pq_dim, kChunkSize) * kIndexGroupVecLen; auto pq_thread_data = pq_dataset + (size_t(cluster_offset) + group_align::roundDown(threadIdx.x)) * pq_line_width + group_align::mod(threadIdx.x) * vec_align::Value; From b690d183e89e7db86b80729ed9db16bec86d773d Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Nov 2022 14:59:18 +0100 Subject: [PATCH 25/31] Simplify transpose_pq_centers code using mdspans --- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 4e25cc2622..66d741b4d3 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -508,24 +508,23 @@ void transpose_pq_centers(index& index, const float* pq_centers_source, rmm::cuda_stream_view stream) { - auto extents = index.pq_centers().extents(); - auto pq_len = index.pq_len(); - auto pq_width = 
index.pq_book_size(); + auto extents = index.pq_centers().extents(); + static_assert(extents.rank() == 3); + auto extents_source = + make_extents(extents.extent(0), extents.extent(2), extents.extent(1)); + auto span_source = + make_mdspan(pq_centers_source, extents_source); linalg::writeOnlyUnaryOp( index.pq_centers().data_handle(), index.pq_centers().size(), - [pq_centers_source, extents, pq_len, pq_width] __device__(float* out, size_t i) { + [span_source, extents] __device__(float* out, size_t i) { uint32_t ii[3]; for (int r = 2; r > 0; r--) { ii[r] = i % extents.extent(r); i /= extents.extent(r); } - ii[0] = i; - auto j2 = ii[1]; - auto j1 = ii[2]; - auto j0 = ii[0]; - auto j = ((j0 * pq_width) + j1) * pq_len + j2; - *out = j2 < pq_len ? pq_centers_source[j] : 0.0f; + ii[0] = i; + *out = span_source(ii[0], ii[2], ii[1]); }, stream); } From 1cb1aa9007b9a047eadc9e7c9a20bc2ce7d305c3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Nov 2022 16:34:05 +0100 Subject: [PATCH 26/31] Fix incorrect composing of 'code' from the bits on boundaries between source ints --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index e154137818..3305a8e599 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -498,11 +498,11 @@ __device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */, return ivfpq_compute_chunk( score, pq_code, pq_codes, lut_head, lut_end); } else if constexpr (Ix < VecT::Ratio) { + uint8_t code = pq_code; + pq_code = pq_codes.val.data[Ix]; constexpr uint32_t kRemBits = PqBits - BitsLeft; constexpr uint32_t kRemMask = (1u << kRemBits) - 1u; - uint8_t code = pq_code << kRemBits; - pq_code = pq_codes.val.data[Ix]; - code |= pq_code & kRemMask; + code |= (pq_code & kRemMask) << BitsLeft; pq_code >>= kRemBits; score += OutT(lut_head[code]); lut_head += kPqShift; From 44ae87ae04acdae32c922ee7e941ea4a9996b7b5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 15 Nov 2022 15:30:49 +0100 Subject: [PATCH 27/31] Allow ivf_pq::build inputs be on the host (for large datasets) --- .../raft/spatial/knn/detail/ann_utils.cuh | 57 +++++++++++- .../raft/spatial/knn/detail/ivf_pq_build.cuh | 88 ++++++++++++++++--- 2 files changed, 132 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 5d031cc51d..266f5a2f56 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -54,8 +54,9 @@ struct pointer_residency_count { cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); switch (attr.type) { - case cudaMemoryTypeUnregistered: - case cudaMemoryTypeHost: return std::make_tuple(on_device, on_host + 1); + case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1); + case cudaMemoryTypeHost: + return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1); case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host); case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1); default: return std::make_tuple(on_device, on_host); @@ -75,6 +76,58 @@ auto check_pointer_residency(const Types*... 
ptrs) -> pointer_residency return pointer_residency::mixed; } +/** RAII helper to access the host data from gpu when necessary. */ +template +struct with_mapped_memory_t { + with_mapped_memory_t(PtrT ptr, size_t size, Action action) : action_(action) + { + if (ptr == nullptr) { return; } + switch (utils::check_pointer_residency(ptr)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: { + dev_ptr_ = (void*)ptr; // NOLINT + } break; + default: { + host_ptr_ = (void*)ptr; // NOLINT + RAFT_CUDA_TRY(cudaHostRegister(host_ptr_, size, choose_flags(ptr))); + RAFT_CUDA_TRY(cudaHostGetDevicePointer(&dev_ptr_, host_ptr_, 0)); + } break; + } + } + + ~with_mapped_memory_t() + { + if (host_ptr_ != nullptr) { cudaHostUnregister(host_ptr_); } + } + + auto operator()() { return action_((PtrT)dev_ptr_); } // NOLINT + + private: + Action action_; + void* host_ptr_ = nullptr; + void* dev_ptr_ = nullptr; + + template + static auto choose_flags(const T*) -> unsigned int + { + int dev_id, readonly_supported; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute( + &readonly_supported, cudaDevAttrHostRegisterReadOnlySupported, dev_id)); + if (readonly_supported) { + return cudaHostRegisterMapped | cudaHostRegisterReadOnly; + } else { + return cudaHostRegisterMapped; + } + } + + template + static auto choose_flags(T*) -> unsigned int + { + return cudaHostRegisterMapped; + } +}; + template struct config { }; diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 66d741b4d3..de7fa1b2f7 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -676,13 +676,17 @@ void train_per_cluster(const handle_t& handle, transpose_pq_centers(index, pq_centers_tmp.data(), stream); } -/** See raft::spatial::knn::ivf_pq::extend docs */ +/** + * See raft::spatial::knn::ivf_pq::extend docs. + * + * This version requires `new_vectors` and `new_indices` (if non-null) to be on-device. + */ template -inline auto extend(const handle_t& handle, - const index& orig_index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) -> index +inline auto extend_device(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim()); @@ -694,6 +698,13 @@ inline auto extend(const handle_t& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); + switch (new_indices != nullptr ? 
utils::check_pointer_residency(new_vectors, new_indices) + : utils::check_pointer_residency(new_vectors)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("[ivf_pq::extend_device] The added data must be available on device."); + } + rmm::mr::device_memory_resource* device_memory = nullptr; auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); if (pool_guard) { @@ -1024,9 +1035,33 @@ inline auto extend(const handle_t& handle, return ext_index; } -/** See raft::spatial::knn::ivf_pq::build docs */ +/** See raft::spatial::knn::ivf_pq::extend docs */ template -inline auto build( +inline auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index +{ + size_t vec_size = sizeof(T) * size_t(n_rows) * size_t(orig_index.dim()); + size_t ind_size = sizeof(IdxT) * size_t(n_rows); + return utils::with_mapped_memory_t{ + new_vectors, vec_size, [&](const T* new_vectors_dev) { + return utils::with_mapped_memory_t{ + new_indices, ind_size, [&](const IdxT* new_indices_dev) { + return extend_device( + handle, orig_index, new_vectors_dev, new_indices_dev, n_rows); + }}(); + }}(); +} + +/** + * See raft::spatial::knn::ivf_pq::build docs. + * + * This version requires `dataset` to be on-device. + */ +template +inline auto build_device( const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) -> index { @@ -1037,6 +1072,12 @@ inline auto build( RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + switch (utils::check_pointer_residency(dataset)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("[ivf_pq::build_device] The dataset pointer must be available on device."); + } + auto stream = handle.get_stream(); index index(handle, params, dim); @@ -1059,9 +1100,21 @@ inline auto build( rmm::mr::pool_memory_resource managed_memory( &managed_memory_upstream, 1024 * 1024); + // If the trainset is small enough to comfortably fit into device memory, put it there. + // Otherwise, use the managed memory. + rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; + { + size_t free_mem, total_mem; + constexpr size_t kTolerableRatio = 4; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) { + big_memory_resource = device_memory; + } + } + // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. 
- rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); + rmm::device_uvector trainset(n_rows_train * index.dim(), stream, big_memory_resource); // TODO: a proper sampling if constexpr (std::is_same_v) { RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), @@ -1101,7 +1154,7 @@ inline auto build( stream); // Trainset labels are needed for training PQ codebooks - rmm::device_uvector labels(n_rows_train, stream, device_memory); + rmm::device_uvector labels(n_rows_train, stream, big_memory_resource); kmeans::predict(handle, cluster_centers, index.n_lists(), @@ -1190,10 +1243,23 @@ inline auto build( // add the data if necessary if (params.add_data_on_build) { - return detail::extend(handle, index, dataset, nullptr, n_rows); + return detail::extend_device(handle, index, dataset, nullptr, n_rows); } else { return index; } } +/** See raft::spatial::knn::ivf_pq::build docs */ +template +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index +{ + size_t data_size = sizeof(T) * size_t(n_rows) * size_t(dim); + return utils::with_mapped_memory_t{dataset, data_size, [&](const T* dataset_dev) { + return build_device( + handle, params, dataset_dev, n_rows, dim); + }}(); +} + } // namespace raft::spatial::knn::ivf_pq::detail From ead6bff65766114e5fe2ab9e2f2941b817f6d2e3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 15 Nov 2022 15:38:40 +0100 Subject: [PATCH 28/31] Update the documentation: ivf_pq::build accepts both host and device pointers --- cpp/include/raft/neighbors/ivf_pq.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 207e298947..5b2035fadf 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -53,7 +53,7 @@ namespace raft::neighbors::ivf_pq { * * @param handle * @param params configure the index building - * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim] + * @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim] * @param n_rows the number of samples * @param dim the dimensionality of the data * @@ -91,8 +91,8 @@ inline auto build( * * @param handle * @param orig_index original index - * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()] - * @param[in] new_indices a device pointer to a vector of indices [n_rows]. + * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a device/host pointer to a vector of indices [n_rows]. * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` * here to imply a continuous range `[0...n_rows)`. * @param n_rows the number of samples @@ -118,8 +118,8 @@ inline auto extend(const handle_t& handle, * * @param handle * @param[inout] index - * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()] - * @param[in] new_indices a device pointer to a vector of indices [n_rows]. + * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a device/host pointer to a vector of indices [n_rows]. * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` * here to imply a continuous range `[0...n_rows)`. 
* @param n_rows the number of samples From d64799f1f8a45c47a06f2e964bb05908587d0f9c Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 16 Nov 2022 07:34:39 +0100 Subject: [PATCH 29/31] Fix an assert that is no longer valid This PR introduces alignment into the cluster sizes and thus the total index size exceeds `n_rows`. --- python/pylibraft/pylibraft/test/test_ivf_pq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py index 367ff6d44a..78a02ad77e 100644 --- a/python/pylibraft/pylibraft/test/test_ivf_pq.py +++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -131,7 +131,7 @@ def run_ivf_pq_build_search_test( index = ivf_pq.extend(index, dataset_1_device, indices_1_device) index = ivf_pq.extend(index, dataset_2_device, indices_2_device) - assert index.size == n_rows + assert index.size >= n_rows queries = generate_data((n_queries, n_cols), dtype) out_idx = np.zeros((n_queries, k), dtype=np.uint64) From b28364d75d6d11b1d839b9efebdf0743b9bbf427 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 17 Nov 2022 09:59:01 +0100 Subject: [PATCH 30/31] Fix incorrect lookup of the DB record when the query result index is exactly the size of a cluster --- cpp/include/raft/neighbors/ivf_pq_types.hpp | 19 +++++++++++++++ .../raft/spatial/knn/detail/ivf_pq_search.cuh | 4 ++-- cpp/test/neighbors/ann_ivf_pq.cuh | 24 +++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index d09bcf4f94..afb3eb6cd6 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -23,6 +23,8 @@ #include #include +#include + #include namespace raft::neighbors::ivf_pq { @@ -183,6 +185,19 @@ struct index : ann::index { "IdxT must be able to represent all values of uint32_t"); public: + /** + * Default value filled in the `indices()` array. + * One may encounter it trying to access a record within a cluster that is outside of the + * `list_sizes()` bound (due to the record alignment `kIndexGroupSize`). + */ + constexpr static IdxT kInvalidRecord = std::numeric_limits::max() - 1; + /** + * Default value returned by `search` when the `n_probes` is too small and top-k is too large. + * One may encounter it if the combined size of probed clusters is smaller than the requested + * number of results per query. + */ + constexpr static IdxT kOutOfBoundsRecord = std::numeric_limits::max(); + /** Total length of the index. */ [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); } /** Dimensionality of the input data. 
*/ @@ -298,6 +313,10 @@ struct index : ann::index { { pq_dataset_ = make_device_mdarray(handle, make_pq_dataset_extents(index_size)); indices_ = make_device_mdarray(handle, make_extents(index_size)); + if (index_size > 0) { + thrust::fill_n( + handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord); + } check_consistency(); } diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index f4db1d67b0..0ff659ae5d 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -327,7 +327,7 @@ __device__ auto find_db_row(IdxT& x, // NOLINT uint32_t ix_max = n_probes; do { uint32_t i = (ix_min + ix_max) / 2; - if (IdxT(chunk_indices[i]) < x) { + if (IdxT(chunk_indices[i]) <= x) { ix_min = i + 1; } else { ix_max = i; @@ -365,7 +365,7 @@ __launch_bounds__(BlockDim) __global__ clusters_to_probe + n_probes * query_ix, chunk_indices + n_probes * query_ix); } - neighbors[k] = valid ? db_indices[data_ix] : std::numeric_limits::max(); + neighbors[k] = valid ? db_indices[data_ix] : index::kOutOfBoundsRecord; } /** diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 36fbef4bd5..bb6fb30ad3 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -459,6 +459,30 @@ inline auto special_cases() -> test_cases_t x.search_params.n_probes = 50; }); + ADD_CASE({ + x.num_db_vecs = 10000; + x.dim = 16; + x.num_queries = 500; + x.k = 128; + x.index_params.metric = distance::DistanceType::L2Expanded; + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.index_params.pq_bits = 8; + x.index_params.n_lists = 100; + x.search_params.n_probes = 100; + }); + + ADD_CASE({ + x.num_db_vecs = 10000; + x.dim = 16; + x.num_queries = 500; + x.k = 129; + x.index_params.metric = distance::DistanceType::L2Expanded; + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.index_params.pq_bits = 8; + x.index_params.n_lists = 100; + x.search_params.n_probes = 100; + }); + return xs; } From 56d6d5bc5be2118c91384caa1fa8888d43caea27 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 17 Nov 2022 11:30:12 +0100 Subject: [PATCH 31/31] Add extra checks for index invariants --- cpp/test/neighbors/ann_ivf_pq.cuh | 49 ++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index bb6fb30ad3..9d6ad11ccb 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -35,6 +35,8 @@ #include +#include +#include #include #include @@ -106,6 +108,18 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream return os; } +template +auto min_output_size(const handle_t& handle, const ivf_pq::index& index, uint32_t n_probes) + -> IdxT +{ + uint32_t skip = index.n_nonempty_lists() > n_probes ? 
index.n_nonempty_lists() - n_probes : 0; + auto map_type = [] __device__(uint32_t x) { return IdxT(x); }; + using iter = cub::TransformInputIterator; + iter start(index.list_sizes().data_handle() + skip, map_type); + iter end(index.list_sizes().data_handle() + index.n_nonempty_lists(), map_type); + return thrust::reduce(handle.get_thrust_policy(), start, end); +} + template class ivf_pq_test : public ::testing::TestWithParam { public: @@ -189,7 +203,7 @@ class ivf_pq_test : public ::testing::TestWithParam { } template - auto run(BuildIndex build_index) + void run(BuildIndex build_index) { auto index = build_index(); @@ -228,6 +242,39 @@ class ivf_pq_test : public ::testing::TestWithParam { ps.k, 0.001 / low_precision_factor, min_recall)); + + // Test a few extra invariants + IdxT min_results = min_output_size(handle_, index, ps.search_params.n_probes); + IdxT max_oob = ps.k <= min_results ? 0 : ps.k - min_results; + IdxT found_oob = 0; + for (uint32_t query_ix = 0; query_ix < ps.num_queries; query_ix++) { + for (uint32_t k = 0; k < ps.k; k++) { + auto flat_i = query_ix * ps.k + k; + auto found_ix = indices_ivf_pq[flat_i]; + if (found_ix == ivf_pq::index::kOutOfBoundsRecord) { + found_oob++; + continue; + } + ASSERT_NE(found_ix, ivf_pq::index::kInvalidRecord) + << "got an invalid record at query_ix = " << query_ix << ", k = " << k + << " (distance = " << distances_ivf_pq[flat_i] << ")"; + ASSERT_LT(found_ix, ps.num_db_vecs) + << "got an impossible index = " << found_ix << " at query_ix = " << query_ix + << ", k = " << k << " (distance = " << distances_ivf_pq[flat_i] << ")"; + } + } + ASSERT_LE(found_oob, max_oob) + << "got too many records out-of-bounds (see ivf_pq::index::kOutOfBoundsRecord)."; + if (found_oob > 0) { + RAFT_LOG_WARN( + "Got %zu results out-of-bounds because of large top-k (%zu) and small n_probes (%u) and " + "small DB size/n_lists ratio (%zu / %u)", + size_t(found_oob), + size_t(ps.k), + ps.search_params.n_probes, + size_t(ps.num_db_vecs), + ps.index_params.n_lists); + } } void SetUp() override // NOLINT