From 35ab60d603afb996cbcb0dd00115db807117d07f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 13 May 2022 10:51:37 +0200
Subject: [PATCH 001/118] inital commit and formatting cleanup

---
 .../raft/common/device_loads_stores.cuh       |  376 ++++-
 cpp/include/raft/spatial/knn/ann.cuh          |   24 +-
 cpp/include/raft/spatial/knn/ann.hpp          |   76 +-
 cpp/include/raft/spatial/knn/ann_common.h     |    2 +
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 1364 ++++++++++++++++
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 1411 +++++++++++++++++
 .../knn/detail/ann_kmeans_balanced.cuh        |  414 +++++
 .../knn/detail/ann_quantized_faiss.cuh        |  266 +++-
 .../raft/spatial/knn/detail/ann_utils.cuh     |  586 +++++++
 .../spatial/knn/detail/topk/radix_topk.cuh    |  204 ++-
 cpp/test/CMakeLists.txt                       |    1 +
 cpp/test/spatial/ann_base_kernel.cuh          |   88 +
 cpp/test/spatial/ann_ivf_flat.cu              |  291 ++++
 13 files changed, 4944 insertions(+), 159 deletions(-)
 create mode 100644 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ann_utils.cuh
 create mode 100644 cpp/test/spatial/ann_base_kernel.cuh
 create mode 100644 cpp/test/spatial/ann_ivf_flat.cu

diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index 41dc9cab08..0c4750aa69 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,121 @@ namespace raft {
  * @param[out] addr shared memory address (should be aligned to vector size)
  * @param[in]  x    data to be stored at this address
  */
+DI void sts(uint8_t* addr, const uint8_t& x)
+{
+  uint32_t x_int;
+  x_int   = x;
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<uint8_t*>(addr));
+  asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int));
+}
+DI void sts(uint8_t* addr, const uint8_t (&x)[1])
+{
+  uint32_t x_int[1];
+  x_int[0] = x[0];
+  auto s1  = __cvta_generic_to_shared(reinterpret_cast<uint8_t*>(addr));
+  asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int[0]));
+}
+DI void sts(uint8_t* addr, const uint8_t (&x)[2])
+{
+  uint32_t x_int[2];
+  x_int[0] = x[0];
+  x_int[1] = x[1];
+  auto s2  = __cvta_generic_to_shared(reinterpret_cast<uint8_t*>(addr));
+  asm volatile("st.shared.v2.u8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1]));
+}
+DI void sts(uint8_t* addr, const uint8_t (&x)[4])
+{
+  uint32_t x_int[4];
+  x_int[0] = x[0];
+  x_int[1] = x[1];
+  x_int[2] = x[2];
+  x_int[3] = x[3];
+  auto s4  = __cvta_generic_to_shared(reinterpret_cast<uint8_t*>(addr));
+  asm volatile("st.shared.v4.u8 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3]));
+}
+
+DI void sts(int8_t* addr, const int8_t& x)
+{
+  int32_t x_int = x;
+  auto s1       = __cvta_generic_to_shared(reinterpret_cast<int8_t*>(addr));
+  asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int));
+}
+DI void sts(int8_t* addr, const int8_t (&x)[1])
+{
+  int32_t x_int[1];
+  x_int[0] = x[0];
+  auto s1  = __cvta_generic_to_shared(reinterpret_cast<int8_t*>(addr));
+  asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int[0]));
+}
+DI void sts(int8_t* addr, const int8_t (&x)[2])
+{
+  int32_t x_int[2];
+  x_int[0] = x[0];
+  x_int[1] = x[1];
+  auto s2  = __cvta_generic_to_shared(reinterpret_cast<int8_t*>(addr));
+  asm volatile("st.shared.v2.s8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1]));
+}
+DI void sts(int8_t* addr, const int8_t (&x)[4])
+{
+  int32_t x_int[4];
+  x_int[0] = x[0];
+  x_int[1] = x[1];
+  x_int[2] = x[2];
+  x_int[3] = x[3];
+  auto s4  = __cvta_generic_to_shared(reinterpret_cast<int8_t*>(addr));
+  asm volatile("st.shared.v4.s8 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3]));
+}
+
+DI void sts(uint32_t* addr, const uint32_t& x)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<uint32_t*>(addr));
+  asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x));
+}
+DI void sts(uint32_t* addr, const uint32_t (&x)[1])
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<uint32_t*>(addr));
+  asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0]));
+}
+DI void sts(uint32_t* addr, const uint32_t (&x)[2])
+{
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<uint2*>(addr));
+  asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1]));
+}
+DI void sts(uint32_t* addr, const uint32_t (&x)[4])
+{
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<uint4*>(addr));
+  asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3]));
+}
+
+DI void sts(int32_t* addr, const int32_t& x)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<int32_t*>(addr));
+  asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x));
+}
+DI void sts(int32_t* addr, const int32_t (&x)[1])
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<int32_t*>(addr));
+  asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0]));
+}
+DI void sts(int32_t* addr, const int32_t (&x)[2])
+{
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<int2*>(addr));
+  asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1]));
+}
+DI void sts(int32_t* addr, const int32_t (&x)[4])
+{
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<int4*>(addr));
+  asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3]));
+}
+
 DI void sts(float* addr, const float& x)
 {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
@@ -83,6 +198,152 @@ DI void sts(double* addr, const double (&x)[2])
  * @param[in]  addr shared memory address from where to load
  *                  (should be aligned to vector size)
  */
+
+DI void lds(uint8_t& x, const uint8_t* addr)
+{
+  uint32_t x_int;
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const uint8_t*>(addr));
+  asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int) : "l"(s1));
+  x = x_int;
+}
+DI void lds(uint8_t (&x)[1], const uint8_t* addr)
+{
+  uint32_t x_int[1];
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const uint8_t*>(addr));
+  asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1));
+  x[0] = x_int[0];
+}
+DI void lds(uint8_t (&x)[2], const uint8_t* addr)
+{
+  uint32_t x_int[2];
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<const uint8_t*>(addr));
+  asm volatile("ld.shared.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+}
+DI void lds(uint8_t (&x)[4], const uint8_t* addr)
+{
+  uint32_t x_int[4];
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<const uint8_t*>(addr));
+  asm volatile("ld.shared.v4.u8 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3])
+               : "l"(s4));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+  x[2] = x_int[2];
+  x[3] = x_int[3];
+}
+
+DI void lds(int8_t& x, const int8_t* addr)
+{
+  int32_t x_int;
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const int8_t*>(addr));
+  asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int) : "l"(s1));
+  x = x_int;
+}
+DI void lds(int8_t (&x)[1], const int8_t* addr)
+{
+  int32_t x_int[1];
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const int8_t*>(addr));
+  asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1));
+  x[0] = x_int[0];
+}
+DI void lds(int8_t (&x)[2], const int8_t* addr)
+{
+  int32_t x_int[2];
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<const int8_t*>(addr));
+  asm volatile("ld.shared.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+}
+DI void lds(int8_t (&x)[4], const int8_t* addr)
+{
+  int32_t x_int[4];
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<const int8_t*>(addr));
+  asm volatile("ld.shared.v4.s8 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3])
+               : "l"(s4));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+  x[2] = x_int[2];
+  x[3] = x_int[3];
+}
+
+DI void lds(uint32_t (&x)[4], const uint32_t* addr)
+{
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<const uint32_t*>(addr));
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3])
+               : "l"(s4));
+}
+
+DI void lds(uint32_t (&x)[2], const uint32_t* addr)
+{
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<const uint32_t*>(addr));
+  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2));
+}
+
+DI void lds(uint32_t (&x)[1], const uint32_t* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const uint32_t*>(addr));
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1));
+}
+
+DI void lds(uint32_t& x, const uint32_t* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const uint32_t*>(addr));
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1));
+}
+
+DI void lds(int32_t (&x)[4], const int32_t* addr)
+{
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<const int32_t*>(addr));
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3])
+               : "l"(s4));
+}
+
+DI void lds(int32_t (&x)[2], const int32_t* addr)
+{
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<const int32_t*>(addr));
+  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2));
+}
+
+DI void lds(int32_t (&x)[1], const int32_t* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const int32_t*>(addr));
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1));
+}
+
+DI void lds(int32_t& x, const int32_t* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const int32_t*>(addr));
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1));
+}
+
+DI void lds(float& x, const float* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
+}
+DI void lds(float (&x)[1], const float* addr)
+{
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<const float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
+}
+DI void lds(float (&x)[2], const float* addr)
+{
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<const float2*>(addr));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2));
+}
+DI void lds(float (&x)[4], const float* addr)
+{
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<const float4*>(addr));
+  asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
+               : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
+               : "l"(s4));
+}
+
 DI void lds(float& x, float* addr)
 {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
@@ -159,6 +420,119 @@ DI void ldg(double (&x)[2], const double* addr)
 {
   asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr));
 }
+
+DI void ldg(uint32_t (&x)[4], const uint32_t* const& addr)
+{
+  asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3])
+               : "l"(addr));
+}
+
+DI void ldg(uint32_t (&x)[2], const uint32_t* const& addr)
+{
+  asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr));
+}
+
+DI void ldg(uint32_t (&x)[1], const uint32_t* const& addr)
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr));
+}
+
+DI void ldg(uint32_t& x, const uint32_t* const& addr)
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+
+DI void ldg(int32_t (&x)[4], const int32_t* const& addr)
+{
+  asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3])
+               : "l"(addr));
+}
+
+DI void ldg(int32_t (&x)[2], const int32_t* const& addr)
+{
+  asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr));
+}
+
+DI void ldg(int32_t (&x)[1], const int32_t* const& addr)
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr));
+}
+
+DI void ldg(int32_t& x, const int32_t* const& addr)
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+
+DI void ldg(uint8_t (&x)[4], const uint8_t* const& addr)
+{
+  uint32_t x_int[4];
+  asm volatile("ld.global.cg.v4.u8 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3])
+               : "l"(addr));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+  x[2] = x_int[2];
+  x[3] = x_int[3];
+}
+
+DI void ldg(uint8_t (&x)[2], const uint8_t* const& addr)
+{
+  uint32_t x_int[2];
+  asm volatile("ld.global.cg.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+}
+
+DI void ldg(uint8_t (&x)[1], const uint8_t* const& addr)
+{
+  uint32_t x_int;
+  asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr));
+  x[0] = x_int;
+}
+
+DI void ldg(uint8_t& x, const uint8_t* const& addr)
+{
+  uint32_t x_int;
+  asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr));
+  x = x_int;
+}
+
+DI void ldg(int8_t (&x)[4], const int8_t* const& addr)
+{
+  int x_int[4];
+  asm volatile("ld.global.cg.v4.s8 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3])
+               : "l"(addr));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+  x[2] = x_int[2];
+  x[3] = x_int[3];
+}
+
+DI void ldg(int8_t (&x)[2], const int8_t* const& addr)
+{
+  int x_int[2];
+  asm volatile("ld.global.cg.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr));
+  x[0] = x_int[0];
+  x[1] = x_int[1];
+}
+
+DI void ldg(int8_t& x, const int8_t* const& addr)
+{
+  int x_int;
+  asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr));
+  x = x_int;
+}
+
+DI void ldg(int8_t (&x)[1], const int8_t* const& addr)
+{
+  int x_int;
+  asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr));
+  x[0] = x_int;
+}
+
 /** @} */
 
 }  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 2ef2ae0fa4..4dfb1b6d89 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -14,9 +14,6 @@
  * limitations under the License.
  */
 
-#ifndef __ANN_H
-#define __ANN_H
-
 #pragma once
 
 #include "ann_common.h"
@@ -25,9 +22,7 @@
 #include <faiss/gpu/GpuIndex.h>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::spatial::knn {
 
 /**
  * @brief Flat C++ API function to build an approximate nearest neighbors index
@@ -42,13 +37,13 @@ namespace knn {
  * @param[in] n number of rows in the index array
  * @param[in] D the dimensionality of the index array
  */
-template <typename value_idx = int>
+template <typename T = float, typename value_idx = int>
 inline void approx_knn_build_index(raft::handle_t& handle,
                                    raft::spatial::knn::knnIndex* index,
                                    knnIndexParam* params,
                                    raft::distance::DistanceType metric,
                                    float metricArg,
-                                   float* index_array,
+                                   T* index_array,
                                    value_idx n,
                                    value_idx D)
 {
@@ -68,20 +63,17 @@ inline void approx_knn_build_index(raft::handle_t& handle,
  * @param[in] query_array the query to perform a search with
  * @param[in] n number of rows in the query array
  */
-template <typename value_idx = int>
+template <typename T = float, typename value_idx = int>
 inline void approx_knn_search(raft::handle_t& handle,
                               float* distances,
                               int64_t* indices,
                               raft::spatial::knn::knnIndex* index,
+                              knnIndexParam* params,
                               value_idx k,
-                              float* query_array,
+                              T* query_array,
                               value_idx n)
 {
-  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
+  detail::approx_knn_search(handle, distances, indices, index, params, k, query_array, n);
 }
 
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
-
-#endif
\ No newline at end of file
+}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index b6d3ca2976..516435271d 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -13,79 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/**
- * This file is deprecated and will be removed in release 22.06.
- * Please use the cuh version instead.
- */
-
-#ifndef __ANN_H
-#define __ANN_H
 
 #pragma once
 
-#include "ann_common.h"
-#include "detail/ann_quantized_faiss.cuh"
-
-#include <faiss/gpu/GpuIndex.h>
-#include <raft/spatial/knn/faiss_mr.hpp>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-/**
- * @brief Flat C++ API function to build an approximate nearest neighbors index
- * from an index array and a set of parameters.
- *
- * @param[in] handle RAFT handle
- * @param[out] index index to be built
- * @param[in] params parametrization of the index to be built
- * @param[in] metric distance metric to use. Euclidean (L2) is used by default
- * @param[in] metricArg metric argument
- * @param[in] index_array the index array to build the index with
- * @param[in] n number of rows in the index array
- * @param[in] D the dimensionality of the index array
- */
-template <typename value_idx = int>
-inline void approx_knn_build_index(raft::handle_t& handle,
-                                   raft::spatial::knn::knnIndex* index,
-                                   knnIndexParam* params,
-                                   raft::distance::DistanceType metric,
-                                   float metricArg,
-                                   float* index_array,
-                                   value_idx n,
-                                   value_idx D)
-{
-  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
-}
-
-/**
- * @brief Flat C++ API function to perform an approximate nearest neighbors
- * search from previously built index and a query array
- *
- * @param[in] handle RAFT handle
- * @param[out] distances distances of the nearest neighbors toward
- *                       their query point
- * @param[out] indices indices of the nearest neighbors
- * @param[in] index index to perform a search with
- * @param[in] k the number of nearest neighbors to search for
- * @param[in] query_array the query to perform a search with
- * @param[in] n number of rows in the query array
- */
-template <typename value_idx = int>
-inline void approx_knn_search(raft::handle_t& handle,
-                              float* distances,
-                              int64_t* indices,
-                              raft::spatial::knn::knnIndex* index,
-                              value_idx k,
-                              float* query_array,
-                              value_idx n)
-{
-  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
-}
-
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
-#endif
\ No newline at end of file
+#include "ann.cuh"
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 5cdd6b1141..cfbde4bf21 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -18,6 +18,7 @@
 
 #include <raft/distance/distance_type.hpp>
 
+#include "detail/ann_ivf_flat.cuh"
 #include <faiss/gpu/GpuIndex.h>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
@@ -29,6 +30,7 @@ struct knnIndex {
   faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
+  std::unique_ptr<detail::cuivflHandle> handle_;
 
   raft::spatial::knn::RmmGpuResources* gpu_res;
   int device;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
new file mode 100644
index 0000000000..1768bf1a1d
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -0,0 +1,1364 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_kmeans_balanced.cuh"
+#include "ann_utils.cuh"
+#include "knn_brute_force_faiss.cuh"
+#include <cublas_v2.h>
+#include <library_types.h>
+#include <raft/spatial/knn/ann_common.h>
+//#include "ann_ivf_flat.cuh"
+#include "ann_ivf_flat_kernel.cuh"
+#include "topk/radix_topk.cuh"
+
+#include "common_faiss.h"
+#include "processing.hpp"
+
+#include "processing.hpp"
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+//#include <label/classlabels.cuh>
+#include <raft/distance/distance.hpp>
+#include <raft/spatial/knn/faiss_mr.hpp>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/utils/Heap.h>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <raft/distance/distance_type.hpp>
+
+#include <iostream>
+#include <set>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+
+template <typename T>
+void _ivfflat_interleaved(
+  T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
+{
+  size_t group_id = index / kWarpSize;
+  size_t in_id    = (index % kWarpSize) * veclen;
+  list_data += (prefix + group_id * kWarpSize) * dim + in_id;
+
+  for (size_t i = 0; i < dim; i += veclen) {
+    for (size_t j = 0; j < veclen; j++) {
+      list_data[i * kWarpSize + j] = dataset[i + j];
+    }
+  }
+}
+
+// This kernel intends to remove the dependency of having dataset in managed mem/host mem.
+//
+template <typename T>
+__global__ void write_ivf_flat_interleaved_index(
+  T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
+{
+  size_t group_id = index / kWarpSize;
+  size_t in_id    = (index % kWarpSize) * veclen;
+  list_data += (prefix + group_id * kWarpSize) * dim + in_id;
+
+  for (size_t i = 0; i < dim; i += veclen) {
+    for (size_t j = 0; j < veclen; j++) {
+      list_data[i * kWarpSize + j] = dataset[i + j];
+    }
+  }
+}
+
+/* CUIVFL status type */
+enum cuivflStatus_t : unsigned int {
+  CUIVFL_STATUS_SUCCESS           = 0,
+  CUIVFL_STATUS_ALLOC_FAILED      = 1,
+  CUIVFL_STATUS_NOT_INITIALIZED   = 2,
+  CUIVFL_STATUS_INVALID_VALUE     = 3,
+  CUIVFL_STATUS_INTERNAL_ERROR    = 4,
+  CUIVFL_STATUS_FILEIO_ERROR      = 5,
+  CUIVFL_STATUS_CUDA_ERROR        = 6,
+  CUIVFL_STATUS_CUBLAS_ERROR      = 7,
+  CUIVFL_STATUS_INVALID_POINTER   = 8,
+  CUIVFL_STATUS_VERSION_ERROR     = 9,
+  CUIVFL_STATUS_UNSUPPORTED_DTYPE = 10,
+  CUIVFL_STATUS_FAISS_ERROR       = 11,
+  CUIVFL_STATUS_NOT_BUILD         = 12
+};
+
+class cuivflHandle {
+ public:
+  cuivflHandle(raft::distance::DistanceType metric_type,
+               uint32_t dim,
+               uint32_t nlist,
+               uint32_t niter,
+               uint32_t device);
+  ~cuivflHandle();
+  cuivflStatus_t cuivflBuildIndex(const void* dataset,
+                                  void* trainset,
+                                  cudaDataType_t dtype,
+                                  uint32_t nrow,
+                                  uint32_t nTrainset,
+                                  cudaStream_t stream);
+  cuivflStatus_t cuivflSaveIndex(const char* fileName);
+
+  cuivflStatus_t cuivflLoadIndex(const char* fileName);
+
+  cuivflStatus_t cuivflSetSearchParameters(const uint32_t nprobe,
+                                           const uint32_t max_batch,
+                                           const uint32_t max_k);
+
+  cuivflStatus_t cuivflSearch(const void* queries,
+                              uint32_t batch_size,
+                              uint32_t k,
+                              size_t* neighbors,
+                              float* distances,
+                              cudaStream_t stream,
+                              cudaDataType_t dtype);
+
+  cuivflStatus_t queryIVFFlatGridSize(const uint32_t nprobe,
+                                      const uint32_t batch_size,
+                                      const uint32_t k);
+  uint32_t getDim();
+
+ private:
+  uint32_t device_;
+  cublasHandle_t cublas_handle_;
+  cudaDataType_t dtype_;
+  raft::distance::DistanceType metric_type_;
+  bool greater_;
+  bool hierarchialClustering_;
+  uint32_t nlist_;        // The number of inverted lists= the number of centriods
+  uint32_t niter_;        // The number of uint32_terations for kmeans to build the indexs
+  uint32_t dim_;          // The dimension of vectors for input dataset
+  uint32_t nprobe_;       // The number of clusters for searching
+  uint32_t nrow_;         // The number of elements for input dataset
+  size_t ninterleave_;    // The number of elements in 32 interleaved group for input dataset
+  size_t buf_topk_size_;  // The size of buffer used for topk select.
+  size_t floatQuerySize;  // The size of float converted queries from int8_t/uint8_t
+  cudaStream_t stream_;   // The stream for build and search
+  uint32_t veclen;        // The vectorization length of dataset in index.
+  uint32_t gridDimX_;     // The number of blocks launched across nprobe.
+
+ private:
+  // device pointer
+  //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
+  void* list_data_dev_ptr_;
+  // The device memory pointer; inverted list for index; size [ninterleave_]
+  uint32_t* list_index_dev_ptr_;
+  // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
+  uint32_t* list_prefix_interleaved_dev_ptr_;
+  // The device memory pointer; the number of each cluster(list); size [nlist_]
+  uint32_t* list_lengths_dev_ptr_;
+  // The device memory pointer; centriod; size [nlist_, dim_]
+  float* centriod_dev_ptr_;
+  // The device memory pointer; centriod norm ; size [nlist_, dim_]
+  float* centriod_norm_dev_ptr_;
+
+  // host pointer
+  //  The host memory pointer; inverted list for data; size [ninterleave_, dim_]
+  void* list_data_host_ptr_;
+  // The host memory pointer; inverted list for index; size [ninterleave_]
+  uint32_t* list_index_host_ptr_;
+  // The host memory pointer; Used for list_data_manage_ptr_; size [nlist_]
+  uint32_t* list_prefix_interleaved_host_ptr_;
+  // The host memory pointer; the number of each cluster(list); size [nlist_]
+  uint32_t* list_lengths_host_ptr_;
+  // The host memory pointer; centriod; size [nlist_, dim_]
+  float* centriod_host_ptr_;
+  // The host memory pointer; centriod norm ; size [nlist_, dim_]
+  float* centriod_norm_host_ptr_;
+  // The device memory; used for topk select.
+  void* buf_dev_ptr_;
+
+ private:
+  cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,
+                                            const void* dataset,
+                                            void* trainset,
+                                            uint32_t* clusterSize,
+                                            cudaDataType_t dtype,
+                                            uint32_t nrow,
+                                            uint32_t ntrain,
+                                            cudaStream_t stream);
+  template <typename T, typename value_t>
+  cuivflStatus_t cuivflSearchImpl(const T* queries,
+                                  uint32_t batch_size,
+                                  uint32_t k,
+                                  size_t* neighbors,
+                                  value_t* distances,
+                                  cudaStream_t stream);
+};
+
+// cuivflCreate
+cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
+                           uint32_t dim,
+                           uint32_t nlist,
+                           uint32_t niter,
+                           uint32_t device)
+{
+  // Device
+  device_        = device;
+  dim_           = dim;
+  nlist_         = nlist;
+  niter_         = niter;
+  metric_type_   = metric_type;
+  floatQuerySize = 0;
+  veclen         = 1;
+  gridDimX_      = 0;
+  stream_        = 0;
+
+  if ((dim % 4) == 0) {
+    veclen = 4;
+  } else if ((dim % 2) == 0) {
+    veclen = 2;
+  }
+
+  // cuBLAS
+  cublasStatus_t cublasError;
+  cublasError = cublasCreate(&(cublas_handle_));
+
+  if (cublasError != CUBLAS_STATUS_SUCCESS) {
+    fprintf(stderr, "(%s) cublasCreate() failed\n", __func__);
+    throw cuivflStatus_t::CUIVFL_STATUS_CUBLAS_ERROR;
+  }
+
+  list_data_dev_ptr_               = nullptr;
+  list_index_dev_ptr_              = nullptr;
+  list_prefix_interleaved_dev_ptr_ = nullptr;
+  list_lengths_dev_ptr_            = nullptr;
+  centriod_dev_ptr_                = nullptr;
+  centriod_norm_dev_ptr_           = nullptr;
+
+  list_data_host_ptr_               = nullptr;
+  list_index_host_ptr_              = nullptr;
+  list_prefix_interleaved_host_ptr_ = nullptr;
+  list_lengths_host_ptr_            = nullptr;
+  centriod_host_ptr_                = nullptr;
+  centriod_norm_host_ptr_           = nullptr;
+
+  buf_dev_ptr_           = nullptr;
+  hierarchialClustering_ = true;
+}
+
+uint32_t cuivflHandle::getDim() { return dim_; }
+
+// cuivflDestroy
+cuivflHandle::~cuivflHandle()
+{
+  if (list_data_dev_ptr_ != nullptr) {
+    cudaFree(list_data_dev_ptr_);
+    list_data_dev_ptr_ = nullptr;
+  }
+  if (list_index_dev_ptr_ != nullptr) {
+    cudaFree(list_index_dev_ptr_);
+    list_index_dev_ptr_ = nullptr;
+  }
+  if (list_prefix_interleaved_dev_ptr_ != nullptr) {
+    cudaFree(list_prefix_interleaved_dev_ptr_);
+    list_prefix_interleaved_dev_ptr_ = nullptr;
+  }
+  if (list_lengths_dev_ptr_ != nullptr) {
+    cudaFree(list_lengths_dev_ptr_);
+    list_lengths_dev_ptr_ = nullptr;
+  }
+  if (centriod_dev_ptr_ != nullptr) {
+    cudaFree(centriod_dev_ptr_);
+    centriod_dev_ptr_ = nullptr;
+  }
+  if (centriod_norm_dev_ptr_ != nullptr) {
+    cudaFree(centriod_norm_dev_ptr_);
+    centriod_norm_dev_ptr_ = nullptr;
+  }
+
+  if (list_data_host_ptr_ != nullptr) {
+    free(list_data_host_ptr_);
+    list_data_host_ptr_ = nullptr;
+  }
+  if (list_index_host_ptr_ != nullptr) {
+    free(list_index_host_ptr_);
+    list_index_host_ptr_ = nullptr;
+  }
+  if (list_prefix_interleaved_host_ptr_ != nullptr) {
+    free(list_prefix_interleaved_host_ptr_);
+    list_prefix_interleaved_host_ptr_ = nullptr;
+  }
+  if (list_lengths_host_ptr_ != nullptr) {
+    free(list_lengths_host_ptr_);
+    list_lengths_host_ptr_ = nullptr;
+  }
+  if (centriod_host_ptr_ != nullptr) {
+    free(centriod_host_ptr_);
+    centriod_host_ptr_ = nullptr;
+  }
+  if (centriod_norm_host_ptr_ != nullptr) {
+    free(centriod_norm_host_ptr_);
+    centriod_norm_host_ptr_ = nullptr;
+  }
+  cublasDestroy(cublas_handle_);
+}  // end func cuivflHandle::cuivflHand
+
+// cuivflLoadIndex
+cuivflStatus_t cuivflHandle::cuivflLoadIndex(const char* fileName)
+{
+  // Step 1: Open the file
+  FILE* fp = fopen(fileName, "r");
+
+  if (fp == NULL) {
+    fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName);
+    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
+  }
+  // Step 2: Write the meta data
+  size_t read_counts = 0;
+  read_counts += fread(&nrow_, sizeof(uint32_t), 1, fp);
+  read_counts += fread(&dtype_, sizeof(dtype_), 1, fp);
+  read_counts += fread(&ninterleave_, sizeof(ninterleave_), 1, fp);
+
+  size_t total_counts =
+    3 + 2 * nlist_ + nlist_ * dim_ + ninterleave_ + ninterleave_ * dim_ + nlist_;
+
+  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
+  list_lengths_host_ptr_            = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
+  list_index_host_ptr_              = (uint32_t*)malloc(sizeof(uint32_t) * ninterleave_);
+  RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
+  RAFT_CUDA_TRY(cudaMalloc(&list_lengths_dev_ptr_, sizeof(uint32_t) * nlist_));
+  RAFT_CUDA_TRY(cudaMalloc(&list_index_dev_ptr_, sizeof(uint32_t) * ninterleave_));
+
+  if (dtype_ == CUDA_R_32F) {
+    list_data_host_ptr_ = malloc(sizeof(float) * ninterleave_ * dim_);
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(float) * ninterleave_ * dim_));
+  } else if (dtype_ == CUDA_R_8U) {
+    list_data_host_ptr_ = malloc(sizeof(uint8_t) * ninterleave_ * dim_);
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(uint8_t) * ninterleave_ * dim_));
+  } else if (dtype_ == CUDA_R_8I) {
+    list_data_host_ptr_ = malloc(sizeof(int8_t) * ninterleave_ * dim_);
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(int8_t) * ninterleave_ * dim_));
+  }
+
+  centriod_host_ptr_ = (float*)malloc(sizeof(float) * nlist_ * dim_);
+  RAFT_CUDA_TRY(cudaMalloc(&centriod_dev_ptr_, sizeof(float) * nlist_ * dim_));
+
+  centriod_norm_host_ptr_ = (float*)malloc(sizeof(float) * nlist_);
+  RAFT_CUDA_TRY(cudaMalloc(&centriod_norm_dev_ptr_, sizeof(float) * nlist_));
+
+  // Step 3: Read the list
+  read_counts += fread(list_prefix_interleaved_host_ptr_, sizeof(uint32_t), nlist_, fp);
+  RAFT_CUDA_TRY(cudaMemcpy(list_prefix_interleaved_dev_ptr_,
+                           list_prefix_interleaved_host_ptr_,
+                           sizeof(uint32_t) * nlist_,
+                           cudaMemcpyHostToDevice));
+
+  read_counts += fread(list_lengths_host_ptr_, sizeof(uint32_t), nlist_, fp);
+  RAFT_CUDA_TRY(cudaMemcpy(list_lengths_dev_ptr_,
+                           list_lengths_host_ptr_,
+                           sizeof(uint32_t) * nlist_,
+                           cudaMemcpyHostToDevice));
+
+  if (dtype_ == CUDA_R_32F) {
+    read_counts += fread(list_data_host_ptr_, sizeof(float), ninterleave_ * dim_, fp);
+    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
+                             list_data_host_ptr_,
+                             sizeof(float) * ninterleave_ * dim_,
+                             cudaMemcpyHostToDevice));
+  } else if (dtype_ == CUDA_R_8U) {
+    read_counts += fread(list_data_host_ptr_, sizeof(uint8_t), ninterleave_ * dim_, fp);
+    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
+                             list_data_host_ptr_,
+                             sizeof(uint8_t) * ninterleave_ * dim_,
+                             cudaMemcpyHostToDevice));
+  } else if (dtype_ == CUDA_R_8I) {
+    read_counts += fread(list_data_host_ptr_, sizeof(int8_t), ninterleave_ * dim_, fp);
+    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
+                             list_data_host_ptr_,
+                             sizeof(int8_t) * ninterleave_ * dim_,
+                             cudaMemcpyHostToDevice));
+  }
+
+  read_counts += fread(list_index_host_ptr_, sizeof(uint32_t), ninterleave_, fp);
+  RAFT_CUDA_TRY(cudaMemcpy(list_index_dev_ptr_,
+                           list_index_host_ptr_,
+                           sizeof(uint32_t) * ninterleave_,
+                           cudaMemcpyHostToDevice));
+  read_counts += fread(centriod_host_ptr_, sizeof(float), nlist_ * dim_, fp);
+  RAFT_CUDA_TRY(cudaMemcpy(
+    centriod_dev_ptr_, centriod_host_ptr_, sizeof(float) * nlist_ * dim_, cudaMemcpyHostToDevice));
+
+  // centriod_norm_host_ptr_
+  read_counts += fread(centriod_norm_host_ptr_, sizeof(float), nlist_, fp);
+  RAFT_CUDA_TRY(cudaMemcpy(centriod_norm_dev_ptr_,
+                           centriod_norm_host_ptr_,
+                           sizeof(float) * nlist_,
+                           cudaMemcpyHostToDevice));
+
+#ifdef DEBUG_L2
+  printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+#endif
+  if (read_counts != total_counts) {
+    fprintf(stderr, "(%s) failed to load index to file (%s)\n", __func__, fileName);
+    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
+  }
+  fclose(fp);
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflHandle::cuivflLoadIndex
+
+// cuivflSaveIndex
+cuivflStatus_t cuivflHandle::cuivflSaveIndex(const char* fileName)
+{
+  // Step 1: Open the file
+  FILE* fp = fopen(fileName, "w");
+  if (fp == NULL) {
+    fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName);
+    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
+  }
+  // Step 2: Write the meta data
+  size_t written_counts = 0;
+  size_t total_counts =
+    3 + 2 * nlist_ + nlist_ * dim_ + ninterleave_ + ninterleave_ * dim_ + nlist_;
+
+  written_counts += fwrite(&nrow_, sizeof(uint32_t), 1, fp);
+  written_counts += fwrite(&dtype_, sizeof(dtype_), 1, fp);
+  written_counts += fwrite(&ninterleave_, sizeof(ninterleave_), 1, fp);
+  // Step 3: Write the list
+
+  written_counts += fwrite(list_prefix_interleaved_host_ptr_, sizeof(uint32_t), nlist_, fp);
+  written_counts += fwrite(list_lengths_host_ptr_, sizeof(uint32_t), nlist_, fp);
+
+  if (dtype_ == CUDA_R_32F) {
+    written_counts += fwrite(list_data_host_ptr_, sizeof(float), ninterleave_ * dim_, fp);
+  } else if (dtype_ == CUDA_R_8U) {
+    written_counts += fwrite(list_data_host_ptr_, sizeof(uint8_t), ninterleave_ * dim_, fp);
+  } else if (dtype_ == CUDA_R_8I) {
+    written_counts += fwrite(list_data_host_ptr_, sizeof(int8_t), ninterleave_ * dim_, fp);
+  }
+
+  written_counts += fwrite(list_index_host_ptr_, sizeof(uint32_t), ninterleave_, fp);
+
+  written_counts += fwrite(centriod_host_ptr_, sizeof(float), nlist_ * dim_, fp);
+  written_counts += fwrite(centriod_norm_host_ptr_, sizeof(float), nlist_, fp);
+
+  if (written_counts != total_counts) {
+    fprintf(stderr, "(%s) failed to save index to file (%s)\n", __func__, fileName);
+    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
+  }
+  fclose(fp);
+
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflHandle::cuivflSaveIndex
+
+// cuivflBuildIndex
+
+cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,
+                                                        const void* dataset,
+                                                        void* trainset,
+                                                        uint32_t* datasetLabels,
+                                                        cudaDataType_t dtype,
+                                                        uint32_t nrow,
+                                                        uint32_t ntrain,
+                                                        cudaStream_t stream)
+{
+  uint32_t numTrainset   = ntrain;
+  uint32_t numClusters   = nlist_;
+  uint32_t dimDataset    = dim_;
+  uint32_t numIterations = niter_;
+
+  uint32_t* trainsetLabels = nullptr;
+  RAFT_CUDA_TRY(cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset));
+
+  float* clusterCenters = centriod_manage_ptr;
+
+  uint32_t numMesoClusters = pow((double)(numClusters), (double)1.0 / 2.0) + 0.5;
+  fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters);
+
+  float* mesoClusterCenters;  // [numMesoClusters, dimDataset]
+  RAFT_CUDA_TRY(
+    cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * dimDataset));
+
+  uint32_t* mesoClusterLabels;  // [numTrainset,]
+  RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset));
+
+  uint32_t* mesoClusterSize;  // [numMesoClusters,]
+  RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters));
+
+  float* mesoClusterCentersTemp;  // [numMesoClusters, dimDataset]
+  RAFT_CUDA_TRY(
+    cudaMallocManaged(&mesoClusterCentersTemp, sizeof(float) * numMesoClusters * dimDataset));
+
+  size_t sizePredictWorkspace =
+    _cuann_kmeans_predict_bufferSize(numMesoClusters,  // number of centers
+                                     dimDataset,
+                                     numTrainset  // number of vectors
+    );
+  void* predictWorkspace = NULL;
+  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+  // Training meso-clusters
+  for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
+    fprintf(stderr,
+            "(%s) Training kmeans of meso-clusters: %.1f / %u    \r",
+            __func__,
+            (float)iter / 2,
+            numIterations);
+    _cuann_kmeans_predict(cublas_handle_,
+                          mesoClusterCenters,
+                          numMesoClusters,
+                          dimDataset,
+                          trainset,
+                          dtype,
+                          numTrainset,
+                          mesoClusterLabels,
+                          metric_type_,
+                          (iter != 0),
+                          predictWorkspace,
+                          mesoClusterCentersTemp,
+                          mesoClusterSize);
+
+    if (iter < 2 * (numIterations - 2)) {
+      if (_cuann_kmeans_adjust_centers(mesoClusterCenters,
+                                       numMesoClusters,
+                                       dimDataset,
+                                       trainset,
+                                       dtype,
+                                       numTrainset,
+                                       mesoClusterLabels,
+                                       metric_type_,
+                                       mesoClusterSize,
+                                       (float)1.0 / 4)) {
+        iter -= 1;
+      }  // end if _cuann_kmeans_adjust_centers
+    }    // end if iter < 2 * (numIterations - 2)
+  }      // end for (int iter = 0; iter < 2 * numIterations; iter += 2)
+
+  fprintf(stderr, "\n");
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+  uint32_t* numFineClusters;  // [numMesoClusters,]
+  numFineClusters            = (uint32_t*)malloc(sizeof(uint32_t) * numMesoClusters);
+  uint32_t* csumFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * (numMesoClusters + 1));
+  csumFineClusters[0]        = 0;
+
+  uint32_t numClustersRemain  = numClusters;
+  uint32_t numTrainsetRemain  = numTrainset;
+  uint32_t mesoClusterSizeMax = 0;
+  uint32_t mesoClusterSizeSum = 0;
+  uint32_t numFineClustersSum = 0;  // checking
+  uint32_t numFineClustersMax = 0;
+  for (uint32_t i = 0; i < numMesoClusters; i++) {
+    if (i < numMesoClusters - 1) {
+      numFineClusters[i] = (double)numClustersRemain * mesoClusterSize[i] / numTrainsetRemain + .5;
+    } else {
+      numFineClusters[i] = numClustersRemain;
+    }
+    numClustersRemain -= numFineClusters[i];
+    numTrainsetRemain -= mesoClusterSize[i];
+    mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]);
+    mesoClusterSizeSum += mesoClusterSize[i];
+    numFineClustersSum += numFineClusters[i];
+    numFineClustersMax      = max(numFineClustersMax, numFineClusters[i]);
+    csumFineClusters[i + 1] = csumFineClusters[i] + numFineClusters[i];
+  }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
+  // fprintf(stderr, "# mesoClusterSizeSum: %u\n", mesoClusterSizeSum);
+  // fprintf(stderr, "# numFineClustersSum: %u\n", numFineClustersSum);
+  assert(mesoClusterSizeSum == numTrainset);
+  assert(numFineClustersSum == numClusters);
+  assert(csumFineClusters[numMesoClusters] == numClusters);
+
+  // uint32_t *idsTrainset = (uint32_t *)malloc(sizeof(uint32_t) * mesoClusterSizeMax);
+  uint32_t* idsTrainset;
+  RAFT_CUDA_TRY(cudaMallocManaged(&idsTrainset, sizeof(uint32_t) * mesoClusterSizeMax));
+  float* subTrainset;
+  RAFT_CUDA_TRY(cudaMallocManaged(&subTrainset, sizeof(float) * mesoClusterSizeMax * dimDataset));
+
+  sizePredictWorkspace = 0;
+  for (uint32_t i = 0; i < numMesoClusters; i++) {
+    sizePredictWorkspace =
+      max(sizePredictWorkspace,
+          _cuann_kmeans_predict_bufferSize(numFineClusters[i],  // number of centers
+                                           dimDataset,
+                                           mesoClusterSize[i]  // number of vectors
+                                           ));
+  }
+
+  // label (cluster ID) of each vector
+  uint32_t* labelsMP = NULL;
+  RAFT_CUDA_TRY(cudaMallocManaged(&labelsMP, mesoClusterSizeMax * sizeof(uint32_t)));
+
+  cudaFree(predictWorkspace);
+  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+
+  float* clusterCentersEach = NULL;
+  RAFT_CUDA_TRY(
+    cudaMallocManaged(&clusterCentersEach, numFineClustersMax * dimDataset * sizeof(float)));
+
+  float* clusterCentersMP = NULL;
+  RAFT_CUDA_TRY(
+    cudaMallocManaged(&clusterCentersMP, numFineClustersMax * dimDataset * sizeof(float)));
+
+  // number of vectors in each cluster
+  uint32_t* clusterSizeMP = NULL;
+  RAFT_CUDA_TRY(cudaMallocManaged(&clusterSizeMP, numFineClustersMax * sizeof(uint32_t)));
+
+  // Training clusters in each meso-clusters
+  uint32_t numClustersDone = 0;
+  for (uint32_t i = 0; i < numMesoClusters; i++) {
+    uint32_t k = 0;
+    for (uint32_t j = 0; j < numTrainset; j++) {
+      if (mesoClusterLabels[j] != i) continue;
+      idsTrainset[k++] = j;
+    }
+    assert(k == mesoClusterSize[i]);
+
+    if (dtype == CUDA_R_32F) {
+      float divisor = 1.0;
+      utils::_cuann_copy_with_list<float>(mesoClusterSize[i],
+                                          dimDataset,
+                                          (const float*)trainset,
+                                          (const uint32_t*)idsTrainset,
+                                          dimDataset,
+                                          subTrainset,
+                                          dimDataset,
+                                          divisor);
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    } else if (dtype == CUDA_R_8U) {
+      float divisor = 256.0;
+      utils::_cuann_copy_with_list<uint8_t>(mesoClusterSize[i],
+                                            dimDataset,
+                                            (const uint8_t*)trainset,
+                                            (const uint32_t*)idsTrainset,
+                                            dimDataset,
+                                            subTrainset,
+                                            dimDataset,
+                                            divisor);
+    } else if (dtype == CUDA_R_8I) {
+      float divisor = 128.0;
+      utils::_cuann_copy_with_list<int8_t>(mesoClusterSize[i],
+                                           dimDataset,
+                                           (const int8_t*)trainset,
+                                           (const uint32_t*)idsTrainset,
+                                           dimDataset,
+                                           subTrainset,
+                                           dimDataset,
+                                           divisor);
+    }
+    for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
+      fprintf(stderr,
+              "(%s) Training kmeans of clusters in meso-cluster %u (numClusters: %u): "
+              "%.1f / %u    \r",
+              __func__,
+              i,
+              numFineClusters[i],
+              (float)iter / 2,
+              numIterations);
+
+      _cuann_kmeans_predict(cublas_handle_,
+                            clusterCentersEach,
+                            numFineClusters[i],
+                            dimDataset,
+                            subTrainset,
+                            CUDA_R_32F,
+                            mesoClusterSize[i],
+                            labelsMP,
+                            metric_type_,
+                            (iter != 0),
+                            predictWorkspace,
+                            clusterCentersMP,
+                            clusterSizeMP);
+
+      if (iter < 2 * (numIterations - 2)) {
+        if (_cuann_kmeans_adjust_centers(clusterCentersEach,
+                                         numFineClusters[i],
+                                         dimDataset,
+                                         subTrainset,
+                                         CUDA_R_32F,
+                                         mesoClusterSize[i],
+                                         labelsMP,
+                                         metric_type_,
+                                         clusterSizeMP,
+                                         (float)1.0 / 4)) {
+          iter -= 1;
+        }
+      }
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    }
+    RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (dimDataset * csumFineClusters[i]),
+                             clusterCentersEach,
+                             sizeof(float) * numFineClusters[i] * dimDataset,
+                             cudaMemcpyDefault));
+    numClustersDone += numFineClusters[i];
+  }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
+  fprintf(stderr, "\n");
+  assert(numClustersDone == numClusters);
+
+  cudaFree(subTrainset);
+  cudaFree(idsTrainset);
+  free(numFineClusters);
+  free(csumFineClusters);
+  cudaFree(mesoClusterSize);
+  cudaFree(mesoClusterLabels);
+  cudaFree(mesoClusterCenters);
+  cudaFree(predictWorkspace);
+  cudaFree(clusterSizeMP);
+  RAFT_CUDA_TRY(cudaFree(clusterCentersEach));
+  RAFT_CUDA_TRY(cudaFree(clusterCentersMP));
+  RAFT_CUDA_TRY(cudaFree(labelsMP));
+
+  // [numClusters, dimDataset]
+  RAFT_CUDA_TRY(cudaMallocManaged(&clusterCentersMP, numClusters * dimDataset * sizeof(float)));
+
+  // [numClusters]
+  RAFT_CUDA_TRY(cudaMallocManaged(&clusterSizeMP, numClusters * sizeof(uint32_t)));
+
+  // [...]
+  sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, numTrainset);
+  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+
+  // Fitting whole clusters using whole trainset.
+  for (int iter = 0; iter < 2; iter++) {
+    _cuann_kmeans_predict(cublas_handle_,
+                          clusterCenters,
+                          numClusters,
+                          dimDataset,
+                          trainset,
+                          dtype,
+                          numTrainset,
+                          trainsetLabels,
+                          metric_type_,
+                          true,
+                          predictWorkspace,
+                          clusterCentersMP,
+                          clusterSizeMP,
+                          true);
+  }  // end for (int iter = 0; iter < 2; iter++)
+
+  fprintf(stderr, "(%s) Final fitting\n", __func__);
+
+  RAFT_CUDA_TRY(cudaFree(trainsetLabels));
+  RAFT_CUDA_TRY(cudaFree(predictWorkspace));
+
+  sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, nrow_);
+  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+
+  _cuann_kmeans_predict(cublas_handle_,
+                        (float*)clusterCenters,
+                        nlist_,
+                        dim_,
+                        dataset,
+                        dtype,
+                        nrow_,
+                        datasetLabels,
+                        metric_type_,
+                        true,
+                        predictWorkspace,
+                        clusterCentersMP,
+                        clusterSizeMP,
+                        true);
+
+  _cuann_kmeans_predict(cublas_handle_,
+                        (float*)clusterCenters,
+                        nlist_,
+                        dim_,
+                        dataset,
+                        dtype,
+                        nrow_,
+                        datasetLabels,
+                        metric_type_,
+                        true,
+                        predictWorkspace,
+                        clusterCentersMP,
+                        clusterSizeMP,
+                        false);
+  RAFT_CUDA_TRY(cudaFree(clusterCentersMP));
+  RAFT_CUDA_TRY(cudaFree(clusterSizeMP));
+  RAFT_CUDA_TRY(cudaFree(predictWorkspace));
+
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflBuildOptimizedKmeans
+
+cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
+                                              void* trainset,
+                                              cudaDataType_t dtype,
+                                              uint32_t nrow,
+                                              uint32_t ntrain,
+                                              cudaStream_t stream)
+{
+  nrow_   = nrow;
+  dtype_  = dtype;
+  stream_ = stream;
+
+  float* centriod_manage_ptr = nullptr;
+  RAFT_CUDA_TRY(cudaMallocManaged(&centriod_manage_ptr, sizeof(float) * nlist_ * dim_));
+
+  if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
+  if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
+    return CUIVFL_STATUS_UNSUPPORTED_DTYPE;
+  }
+
+  // Alloc manage memory for centriods, trainset and workspace
+  uint32_t* datasetLabels;  // [numDataset]
+  RAFT_CUDA_TRY(cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * nrow_));
+
+  // Step 3: Predict labels of the whole dataset
+  cuivflBuildOptimizedKmeans(
+    centriod_manage_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream);
+
+  // Step 3.2: Calculate the L2 related result
+  centriod_norm_host_ptr_ = (float*)malloc(sizeof(float) * nlist_);
+  RAFT_CUDA_TRY(cudaMalloc(&centriod_norm_dev_ptr_, sizeof(float) * nlist_));
+
+  if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
+    utils::_cuann_sqsum(nlist_, dim_, centriod_manage_ptr, centriod_norm_dev_ptr_);
+#ifdef DEBUG_L2
+    printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+#endif
+  }
+
+  // Step 4: Record the number of elements in each clusters
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  list_lengths_host_ptr_            = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
+  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
+  memset(list_lengths_host_ptr_, 0, sizeof(uint32_t) * nlist_);
+
+  for (uint32_t i = 0; i < nrow_; i++) {
+    uint32_t id_cluster = datasetLabels[i];
+    list_lengths_host_ptr_[id_cluster] += 1;
+  }
+
+  ninterleave_ = 0;
+  for (uint32_t i = 0; i < nlist_; i++) {
+    list_prefix_interleaved_host_ptr_[i] = ninterleave_;
+    ninterleave_ += ((list_lengths_host_ptr_[i] - 1) / kWarpSize + 1) * kWarpSize;
+  }
+
+  if (dtype == CUDA_R_32F) {
+    list_data_host_ptr_ = malloc(sizeof(float) * ninterleave_ * dim_);
+    memset(list_data_host_ptr_, 0, sizeof(float) * ninterleave_ * dim_);
+  } else if (dtype == CUDA_R_8U) {
+    list_data_host_ptr_ = malloc(sizeof(uint8_t) * ninterleave_ * dim_);
+    memset(list_data_host_ptr_, 0, sizeof(uint8_t) * ninterleave_ * dim_);
+  } else if (dtype == CUDA_R_8I) {
+    list_data_host_ptr_ = malloc(sizeof(int8_t) * ninterleave_ * dim_);
+    memset(list_data_host_ptr_, 0, sizeof(int8_t) * ninterleave_ * dim_);
+  }
+  list_index_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * ninterleave_);
+  memset(list_index_host_ptr_, 0, sizeof(uint32_t) * ninterleave_);
+  memset(list_lengths_host_ptr_, 0, sizeof(uint32_t) * nlist_);
+
+  if ((dtype == CUDA_R_8I) || (dtype == CUDA_R_8U)) {
+    if ((dim_ % 16) == 0) {
+      veclen = 16;
+    } else if ((dim_ % 8) == 0) {
+      veclen = 8;
+    }
+  }
+
+  for (size_t i = 0; i < nrow_; i++) {
+    uint32_t id_cluster     = datasetLabels[i];
+    uint32_t current_add    = list_lengths_host_ptr_[id_cluster];
+    uint32_t interleave_add = list_prefix_interleaved_host_ptr_[id_cluster];
+
+    if (dtype == CUDA_R_32F) {
+      float* list_data = (float*)list_data_host_ptr_;
+      float* ori_data  = (float*)dataset;
+      _ivfflat_interleaved(
+        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
+    } else if (dtype == CUDA_R_8U) {
+      uint8_t* list_data = (uint8_t*)list_data_host_ptr_;
+      uint8_t* ori_data  = (uint8_t*)dataset;
+      _ivfflat_interleaved(
+        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
+    } else if (dtype == CUDA_R_8I) {
+      int8_t* list_data = (int8_t*)list_data_host_ptr_;
+      int8_t* ori_data  = (int8_t*)dataset;
+      _ivfflat_interleaved(
+        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
+    }
+    list_index_host_ptr_[interleave_add + current_add] = i;
+    list_lengths_host_ptr_[id_cluster] += 1;
+  }
+
+  RAFT_CUDA_TRY(cudaMalloc(&centriod_dev_ptr_, sizeof(float) * nlist_ * dim_));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(centriod_dev_ptr_,
+                                centriod_manage_ptr,
+                                sizeof(float) * nlist_ * dim_,
+                                cudaMemcpyDefault,
+                                stream));
+
+  RAFT_CUDA_TRY(cudaFree(datasetLabels));
+  RAFT_CUDA_TRY(cudaFree(centriod_manage_ptr));
+
+  // Store index on GPU memory: temp WAR until we've entire index building buffers on device
+  RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
+  RAFT_CUDA_TRY(cudaMalloc(&list_lengths_dev_ptr_, sizeof(uint32_t) * nlist_));
+  RAFT_CUDA_TRY(cudaMalloc(&list_index_dev_ptr_, sizeof(uint32_t) * ninterleave_));
+
+  if (dtype_ == CUDA_R_32F) {
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(float) * ninterleave_ * dim_));
+  } else if (dtype_ == CUDA_R_8U) {
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(uint8_t) * ninterleave_ * dim_));
+  } else if (dtype_ == CUDA_R_8I) {
+    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(int8_t) * ninterleave_ * dim_));
+  }
+
+  // Step 3: Read the list
+  RAFT_CUDA_TRY(cudaMemcpyAsync(list_prefix_interleaved_dev_ptr_,
+                                list_prefix_interleaved_host_ptr_,
+                                sizeof(uint32_t) * nlist_,
+                                cudaMemcpyHostToDevice,
+                                stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(list_lengths_dev_ptr_,
+                                list_lengths_host_ptr_,
+                                sizeof(uint32_t) * nlist_,
+                                cudaMemcpyHostToDevice,
+                                stream));
+
+  if (dtype_ == CUDA_R_32F) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
+                                  list_data_host_ptr_,
+                                  sizeof(float) * ninterleave_ * dim_,
+                                  cudaMemcpyHostToDevice,
+                                  stream));
+  } else if (dtype_ == CUDA_R_8U) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
+                                  list_data_host_ptr_,
+                                  sizeof(uint8_t) * ninterleave_ * dim_,
+                                  cudaMemcpyHostToDevice,
+                                  stream));
+  } else if (dtype_ == CUDA_R_8I) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
+                                  list_data_host_ptr_,
+                                  sizeof(int8_t) * ninterleave_ * dim_,
+                                  cudaMemcpyHostToDevice,
+                                  stream));
+  }
+  RAFT_CUDA_TRY(cudaMemcpyAsync(list_index_dev_ptr_,
+                                list_index_host_ptr_,
+                                sizeof(uint32_t) * ninterleave_,
+                                cudaMemcpyHostToDevice,
+                                stream));
+
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflBuildIndex
+
+cuivflStatus_t cuivflHandle::queryIVFFlatGridSize(const uint32_t nprobe,
+                                                  const uint32_t batch_size,
+                                                  const uint32_t k)
+{
+  // query the gridDimX size to store probes topK output
+  switch (dtype_) {
+    case CUDA_R_32F:
+      ivfflat_interleaved_scan<float, float>(nullptr,
+                                             nullptr,
+                                             nullptr,
+                                             nullptr,
+                                             nullptr,
+                                             nullptr,
+                                             metric_type_,
+                                             nprobe,
+                                             k,
+                                             batch_size,
+                                             dim_,
+                                             nullptr,
+                                             nullptr,
+                                             0,
+                                             greater_,
+                                             veclen,
+                                             gridDimX_);
+      break;
+    case CUDA_R_8U:
+      // we use int32_t for accumulation, and final store in fp32
+      ivfflat_interleaved_scan<uint8_t, uint32_t>(nullptr,
+                                                  nullptr,
+                                                  nullptr,
+                                                  nullptr,
+                                                  nullptr,
+                                                  nullptr,
+                                                  metric_type_,
+                                                  nprobe,
+                                                  k,
+                                                  batch_size,
+                                                  dim_,
+                                                  nullptr,
+                                                  nullptr,
+                                                  0,
+                                                  greater_,
+                                                  veclen,
+                                                  gridDimX_);
+      break;
+    case CUDA_R_8I:
+      ivfflat_interleaved_scan<int8_t, int32_t>(nullptr,
+                                                nullptr,
+                                                nullptr,
+                                                nullptr,
+                                                nullptr,
+                                                nullptr,
+                                                metric_type_,
+                                                nprobe,
+                                                k,
+                                                batch_size,
+                                                dim_,
+                                                nullptr,
+                                                nullptr,
+                                                0,
+                                                greater_,
+                                                veclen,
+                                                gridDimX_);
+      break;
+    default: break;
+  }
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}
+
+// cuivflSetSearchParameters
+cuivflStatus_t cuivflHandle::cuivflSetSearchParameters(const uint32_t nprobe,
+                                                       const uint32_t max_batch,
+                                                       const uint32_t max_k)
+{
+  nprobe_ = nprobe;
+  if (nprobe_ <= 0) { return CUIVFL_STATUS_INVALID_VALUE; }
+  // Set the greater_
+  if (metric_type_ == raft::distance::DistanceType::L2Expanded ||
+      metric_type_ == raft::distance::DistanceType::L2Unexpanded) {
+    greater_ = false;
+  } else {
+    greater_ = false;  // Need to set this to true for inner product if need FAISS like behavior for
+                       // inner product
+  }
+
+  // Set buffer
+  if ((dtype_ == CUDA_R_8U) || (dtype_ == CUDA_R_8I)) {
+    floatQuerySize = sizeof(float) * max_batch * dim_;
+    if ((dim_ % 16) == 0) {
+      veclen = 16;
+    } else if ((dim_ % 8) == 0) {
+      veclen = 8;
+    }
+  } else {
+    floatQuerySize = 0;
+  }
+
+  size_t buf_coarse_size = 0;
+  topk::radix_topk_11bits<float, uint32_t>(nullptr,
+                                           buf_coarse_size,
+                                           nullptr,
+                                           (uint32_t)max_batch,
+                                           (uint32_t)nlist_,
+                                           (uint32_t)nprobe,
+                                           nullptr,
+                                           nullptr,
+                                           greater_,
+                                           0);
+
+  size_t buf_refine_size = 0;
+//#ifdef RADIX
+#if 1
+  topk::radix_topk_11bits<float, size_t>(nullptr,
+                                         buf_refine_size,
+                                         nullptr,
+                                         nullptr,
+                                         (size_t)max_batch,
+                                         (size_t)max_k * nprobe,
+                                         (size_t)max_k,
+                                         nullptr,
+                                         nullptr,
+                                         greater_,
+                                         0);
+#else
+  nv::warp_sort_topk<float, size_t>(nullptr,
+                                    buf_refine_size,
+                                    nullptr,
+                                    nullptr,
+                                    (size_t)max_batch,
+                                    (size_t)(max_k * nprobe),
+                                    (size_t)max_k,
+                                    nullptr,
+                                    nullptr,
+                                    greater_,
+                                    0);
+#endif
+
+  buf_topk_size_            = buf_coarse_size > buf_refine_size ? buf_coarse_size : buf_refine_size;
+  uint32_t query_norm_size  = max_batch * sizeof(float);
+  std::vector<size_t> sizes = {query_norm_size,
+                               max_batch * nlist_ * sizeof(float),
+                               max_batch * nprobe * sizeof(float),
+                               max_batch * nprobe * sizeof(uint32_t),
+                               max_batch * nprobe * max_k * sizeof(float),
+                               max_batch * nprobe * max_k * sizeof(size_t),
+                               buf_topk_size_,
+                               floatQuerySize};
+
+  size_t total_size = utils::calc_aligned_size(sizes);
+
+  if (buf_dev_ptr_ != nullptr) { RAFT_CUDA_TRY(cudaFree(buf_dev_ptr_)); }
+  RAFT_CUDA_TRY(cudaMalloc(&buf_dev_ptr_, total_size));
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}
+
+// cuivflSearch
+cuivflStatus_t cuivflHandle::cuivflSearch(const void* queries,  // [numQueries, dimDataset]
+                                          uint32_t batch_size,
+                                          uint32_t k,
+                                          size_t* neighbors,  // [numQueries, topK]
+                                          float* distances,
+                                          cudaStream_t stream,
+                                          cudaDataType_t dtype)
+{
+  switch (dtype) {
+    case CUDA_R_32F:
+      cuivflSearchImpl<float, float>(reinterpret_cast<const float*>(queries),
+                                     batch_size,
+                                     k,
+                                     neighbors,
+                                     reinterpret_cast<float*>(distances),
+                                     stream);
+      break;
+    case CUDA_R_8U:
+      cuivflSearchImpl<uint8_t, float>(
+        reinterpret_cast<const uint8_t*>(queries), batch_size, k, neighbors, distances, stream);
+      break;
+    case CUDA_R_8I:
+      cuivflSearchImpl<int8_t, float>(
+        reinterpret_cast<const int8_t*>(queries), batch_size, k, neighbors, distances, stream);
+      break;
+    default: printf("unsupported data type\n"); break;
+  }
+
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflSearch
+
+template <typename T, typename value_t>
+cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries, dimDataset]
+                                              uint32_t batch_size,
+                                              uint32_t k,
+                                              size_t* neighbors,  // [numQueries, topK]
+                                              value_t* distances,
+                                              cudaStream_t stream)
+{
+  uint32_t nprobe = std::min(nprobe_, (uint32_t)nlist_);
+  stream_         = stream;
+
+  cublasSetStream(cublas_handle_, stream_);
+  gridDimX_ = 0;
+  queryIVFFlatGridSize(nprobe, batch_size, k);
+  // Prepare the buffer for topk calculation
+  uint32_t query_norm_size            = batch_size * sizeof(float);
+  std::vector<size_t> sizes           = {query_norm_size,
+                               batch_size * nlist_ * sizeof(float),
+                               batch_size * nprobe * sizeof(float),
+                               batch_size * nprobe * sizeof(uint32_t),
+                               batch_size * nprobe * k * sizeof(float),
+                               batch_size * nprobe * k * sizeof(size_t),
+                               buf_topk_size_,
+                               floatQuerySize};
+  size_t total_size                   = utils::calc_aligned_size(sizes);
+  std::vector<void*> aligned_pointers = utils::calc_aligned_pointers(buf_dev_ptr_, sizes);
+
+  // The norm of query [batch_size];
+  float* query_norm_dev_ptr = static_cast<float*>(aligned_pointers[0]);
+  // The distance value of cluster(list) and queries;[batch, nlist_]
+  float* distance_buffer_dev_ptr = static_cast<float*>(aligned_pointers[1]);
+  // The topk distance value of cluster(list) and queries;[batch, nprobe]
+  float* coarse_distances_dev_ptr = static_cast<float*>(aligned_pointers[2]);
+  // TODO:use float datatype here for now.
+  // The topk  index of cluster(list) and queries;[batch, nprobe]
+  uint32_t* coarse_indices_dev_ptr = static_cast<uint32_t*>(aligned_pointers[3]);
+  // The topk distance value of candicate vectors from each cluster(list);[batch,k]
+  value_t* refined_distances_dev_ptr = static_cast<value_t*>(aligned_pointers[4]);
+  // The topk index of candicate vectors from each cluster(list);[batch, k]
+  size_t* refined_indices_dev_ptr = static_cast<size_t*>(aligned_pointers[5]);
+  void* buf_topk_dev_ptr          = static_cast<void*>(aligned_pointers[6]);
+  float* convertedQueries         = static_cast<float*>(aligned_pointers[7]);
+
+  if constexpr (std::is_same<T, uint8_t>{}) {
+    constexpr float divisor = 256.0;
+    utils::_cuann_copy<uint8_t, float>(
+      batch_size, dim_, (uint8_t*)queries, dim_, convertedQueries, dim_, stream, divisor);
+  } else if constexpr (std::is_same<T, int8_t>{}) {
+    constexpr float divisor = 128.0;
+    utils::_cuann_copy<int8_t, float>(
+      batch_size, dim_, (int8_t*)queries, dim_, convertedQueries, dim_, stream, divisor);
+  } else {
+    convertedQueries = (float*)(queries);
+  }
+
+  float alpha = 1.0f;
+  float beta  = 0.0f;
+
+  if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
+    alpha = -2.0f;
+    beta  = 1.0f;
+    utils::_cuann_sqsum(batch_size, dim_, convertedQueries, query_norm_dev_ptr);
+    utils::_cuann_outer_add(
+      query_norm_dev_ptr, batch_size, centriod_norm_dev_ptr_, nlist_, distance_buffer_dev_ptr);
+#ifdef DEBUG_L2
+    utils::printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+    utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
+#endif
+  } else {
+    alpha = 1.0f;
+    beta  = 0.0f;
+  }
+
+  cublasGemmEx(cublas_handle_,
+               CUBLAS_OP_T,
+               CUBLAS_OP_N,
+               nlist_,
+               batch_size,
+               dim_,
+               &alpha,
+               centriod_dev_ptr_,
+               CUDA_R_32F,
+               dim_,
+               convertedQueries,
+               CUDA_R_32F,
+               dim_,
+               &beta,
+               distance_buffer_dev_ptr,
+               CUDA_R_32F,
+               nlist_,
+               CUDA_R_32F,
+               CUBLAS_GEMM_DEFAULT);
+
+#ifdef DEBUG_L2
+  utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
+#endif
+  topk::radix_topk_11bits<value_t, uint32_t>(buf_topk_dev_ptr,
+                                             buf_topk_size_,
+                                             distance_buffer_dev_ptr,
+                                             (uint32_t)batch_size,
+                                             (uint32_t)nlist_,
+                                             (uint32_t)nprobe,
+                                             coarse_distances_dev_ptr,
+                                             coarse_indices_dev_ptr,
+                                             greater_,
+                                             stream);
+#ifdef DEBUG_L2
+  utils::printDevPtr(coarse_indices_dev_ptr, 1 * nprobe, "coarse_indices_dev_ptr");
+  utils::printDevPtr(coarse_distances_dev_ptr, 1 * nprobe, "coarse_distances_dev_ptr");
+#endif
+
+  value_t* distances_dev_ptr = refined_distances_dev_ptr;
+  size_t* indices_dev_ptr    = refined_indices_dev_ptr;
+  if (nprobe == 1 || gridDimX_ == 1) {
+    distances_dev_ptr = distances;
+    indices_dev_ptr   = neighbors;
+  }
+
+  if constexpr (std::is_same<T, float>{}) {
+    ivfflat_interleaved_scan<float, float>(queries,
+                                           coarse_indices_dev_ptr,
+                                           list_index_dev_ptr_,
+                                           list_data_dev_ptr_,
+                                           list_lengths_dev_ptr_,
+                                           list_prefix_interleaved_dev_ptr_,
+                                           metric_type_,
+                                           nprobe,
+                                           k,
+                                           batch_size,
+                                           dim_,
+                                           indices_dev_ptr,
+                                           distances_dev_ptr,
+                                           stream,
+                                           greater_,
+                                           veclen,
+                                           gridDimX_);
+  } else if constexpr (std::is_same<T, uint8_t>{}) {
+    // we use int32_t for accumulation, and final store in fp32
+    ivfflat_interleaved_scan<uint8_t, uint32_t>(queries,
+                                                coarse_indices_dev_ptr,
+                                                list_index_dev_ptr_,
+                                                list_data_dev_ptr_,
+                                                list_lengths_dev_ptr_,
+                                                list_prefix_interleaved_dev_ptr_,
+                                                metric_type_,
+                                                nprobe,
+                                                k,
+                                                batch_size,
+                                                dim_,
+                                                indices_dev_ptr,
+                                                distances_dev_ptr,
+                                                stream,
+                                                greater_,
+                                                veclen,
+                                                gridDimX_);
+  } else if constexpr (std::is_same<T, int8_t>{}) {
+    ivfflat_interleaved_scan<int8_t, int32_t>(queries,
+                                              coarse_indices_dev_ptr,
+                                              list_index_dev_ptr_,
+                                              list_data_dev_ptr_,
+                                              list_lengths_dev_ptr_,
+                                              list_prefix_interleaved_dev_ptr_,
+                                              metric_type_,
+                                              nprobe,
+                                              k,
+                                              batch_size,
+                                              dim_,
+                                              indices_dev_ptr,
+                                              distances_dev_ptr,
+                                              stream,
+                                              greater_,
+                                              veclen,
+                                              gridDimX_);
+  }
+
+#ifdef DEBUG_L2
+  utils::printDevPtr(distances_dev_ptr, 2 * k, "distances_dev_ptr");
+  utils::printDevPtr(indices_dev_ptr, 2 * k, "indices_dev_ptr");
+#endif
+
+  if (gridDimX_ > 1) {
+//#ifdef RADIX
+#if 1
+    topk::radix_topk_11bits<value_t, size_t>(buf_topk_dev_ptr,
+                                             buf_topk_size_,
+                                             refined_distances_dev_ptr,
+                                             refined_indices_dev_ptr,
+                                             (size_t)batch_size,
+                                             (size_t)k * gridDimX_,
+                                             (size_t)k,
+                                             distances,
+                                             neighbors,
+                                             greater_,
+                                             stream);
+#else
+    topk::warp_sort_topk<value_t, size_t>(buf_topk_dev_ptr,
+                                          buf_topk_size_,
+                                          refined_distances_dev_ptr,
+                                          refined_indices_dev_ptr,
+                                          (size_t)batch_size,
+                                          (size_t)(k * gridDimX_),
+                                          (size_t)k,
+                                          distances,
+                                          neighbors,
+                                          greater_,
+                                          stream);
+#endif
+  }  // end if nprobe=1
+
+  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+}  // end func cuivflHandle::cuivflSearchImpl
+
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
new file mode 100644
index 0000000000..904a64c374
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -0,0 +1,1411 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../ann_common.h"
+#include "ann_utils.cuh"
+#include "knn_brute_force_faiss.cuh"
+#include "topk/warpsort_topk.cuh"
+#include <raft/common/device_loads_stores.cuh>
+
+#include "common_faiss.h"
+#include "processing.hpp"
+
+#include "processing.hpp"
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+//#include <label/classlabels.cuh>
+#include <raft/distance/distance.hpp>
+#include <raft/spatial/knn/faiss_mr.hpp>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/utils/Heap.h>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <raft/distance/distance_type.hpp>
+
+#include <iostream>
+#include <set>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+
+#define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH)
+// build
+
+// #define DEBUG
+/// Init centriods
+template <typename T>
+void ivfflat_centriod_init(T* dataset, T* centriod, int nlist, int dim, int n)
+{
+  // srand (time(NULL));
+  int nparts = n / nlist;
+  int index  = rand() % nparts;
+  for (int i = 0; i < nlist; i++) {
+    memcpy(centriod + i * dim, dataset + i * nparts + index, sizeof(T) * dim);
+  }  // end for
+}  // end func ivfflat_centriod_init
+
+// search
+template <typename U, typename V>
+constexpr __host__ __device__ auto divUp(U a, V b)
+{
+  return (a + b - 1) / b;
+}
+
+template <typename U, typename V>
+constexpr __host__ __device__ auto divDown(U a, V b)
+{
+  return (a / b);
+}
+
+template <typename U, typename V>
+constexpr __host__ __device__ auto roundDown(U a, V b)
+{
+  return divDown(a, b) * b;
+}
+
+template <typename T, int veclen>
+__device__ __forceinline__ void queryLoadToShmem(const T* const& query,
+                                                 T* queryShared,
+                                                 const int loadDim)
+{
+  T queryReg[veclen];
+  const int loadIndex = loadDim * veclen;
+  ldg(queryReg, query + loadIndex);
+  sts(&queryShared[loadIndex], queryReg);
+}
+
+template <>
+__device__ __forceinline__ void queryLoadToShmem<uint8_t, 8>(const uint8_t* const& query,
+                                                             uint8_t* queryShared,
+                                                             const int loadDim)
+{
+  constexpr int veclen = 2;  // 8 uint8_t
+  uint32_t queryReg[veclen];
+  const int loadIndex = loadDim * veclen;
+  ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
+  sts(reinterpret_cast<uint32_t*>(queryShared) + loadIndex, queryReg);
+}
+
+template <>
+__device__ __forceinline__ void queryLoadToShmem<uint8_t, 16>(const uint8_t* const& query,
+                                                              uint8_t* queryShared,
+                                                              const int loadDim)
+{
+  constexpr int veclen = 4;  // 16 uint8_t
+  uint32_t queryReg[veclen];
+  const int loadIndex = loadDim * veclen;
+  ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
+  sts(reinterpret_cast<uint32_t*>(queryShared) + loadIndex, queryReg);
+}
+
+template <>
+__device__ __forceinline__ void queryLoadToShmem<int8_t, 8>(const int8_t* const& query,
+                                                            int8_t* queryShared,
+                                                            const int loadDim)
+{
+  constexpr int veclen = 2;  // 8 int8_t
+  int32_t queryReg[veclen];
+  const int loadIndex = loadDim * veclen;
+  ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
+  sts(reinterpret_cast<int32_t*>(queryShared) + loadIndex, queryReg);
+}
+
+template <>
+__device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const& query,
+                                                             int8_t* queryShared,
+                                                             const int loadDim)
+{
+  constexpr int veclen = 4;  // 16 int8_t
+  int32_t queryReg[veclen];
+  const int loadIndex = loadDim * veclen;
+  ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
+  sts(reinterpret_cast<int32_t*>(queryShared) + loadIndex, queryReg);
+}
+
+template <int kUnroll,
+          int wordsPerVectorBlockDim,
+          typename computeLambda,
+          int veclen,
+          typename T,
+          typename AccT>
+struct loadAndComputeDist {
+  computeLambda computeDist;
+  AccT& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(AccT& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  template <typename IdxT>
+  __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
+                                                      const T* queryShared,
+                                                      IdxT loadIndex,
+                                                      IdxT baseShmemIndex,
+                                                      IdxT iShmemIndex)
+  {
+    T encV[kUnroll][veclen];
+    T queryRegs[kUnroll][veclen];
+    constexpr int stride  = kUnroll * veclen;
+    const int shmemStride = baseShmemIndex + iShmemIndex * stride;
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      ldg(encV[j], data + (loadIndex + j * wordsPerVectorBlockDim) * veclen);
+      const int d = shmemStride + j * veclen;
+      lds(queryRegs[j], &queryShared[d]);
+#pragma unroll
+      for (int k = 0; k < veclen; ++k) {
+        computeDist(dist, queryRegs[j][k], encV[j][k]);
+      }
+    }
+  }
+
+  template <typename IdxT>
+  __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
+                                                        const T* query,
+                                                        IdxT baseLoadIndex,
+                                                        const int laneId)
+  {
+    T encV[kUnroll][veclen];
+    T queryReg               = query[baseLoadIndex + laneId];
+    constexpr int stride     = kUnroll * veclen;
+    constexpr int totalIter  = kWarpSize / stride;
+    constexpr int gmemStride = stride * wordsPerVectorBlockDim;
+#pragma unroll
+    for (int i = 0; i < totalIter; ++i, data += gmemStride) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        ldg(encV[j], (data + (laneId + j * wordsPerVectorBlockDim) * veclen));
+        T q[veclen];
+        const int d = (i * kUnroll + j) * veclen;
+#pragma unroll
+        for (int k = 0; k < veclen; ++k) {
+          q[k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          computeDist(dist, q[k], encV[j][k]);  //@TODO add other metrics
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const T*& data, const T* query, const int laneId, const int dim, const int dimBlocks)
+  {
+    const int loadDim     = dimBlocks + laneId;
+    T queryReg            = loadDim < dim ? query[loadDim] : 0;
+    const int loadDataIdx = laneId * veclen;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      T enc[veclen];
+      T q[veclen];
+      ldg(enc, data + loadDataIdx);
+#pragma unroll
+      for (int k = 0; k < veclen; k++) {
+        q[k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+        computeDist(dist, q[k], enc[k]);
+      }
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+// This handles uint8_t 8, 16 veclens
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda, int uint8_veclen>
+struct loadAndComputeDist<kUnroll,
+                          wordsPerVectorBlockDim,
+                          computeLambda,
+                          uint8_veclen,
+                          uint8_t,
+                          uint32_t> {
+  computeLambda computeDist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
+    uint32_t encV[kUnroll][veclen_int];
+    uint32_t queryRegs[kUnroll][veclen_int];
+
+    loadIndex = loadIndex * veclen_int;
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      ldg(encV[j],
+          reinterpret_cast<unsigned const*>(data) + loadIndex +
+            j * wordsPerVectorBlockDim * veclen_int);
+      const int d = iShmemIndex * kUnroll + j * veclen_int;
+      lds(queryRegs[j], reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex) + d);
+#pragma unroll
+      for (int k = 0; k < veclen_int; k++) {
+        computeDist(dist, queryRegs[j][k], encV[j][k]);
+      }
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
+    uint32_t encV[kUnroll][veclen_int];
+    uint32_t queryReg =
+      (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
+    uint32_t q[kUnroll][veclen_int];
+    constexpr int stride = kUnroll * uint8_veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        ldg(encV[j],
+            reinterpret_cast<unsigned const*>(data) +
+              (laneId + j * wordsPerVectorBlockDim) * veclen_int);
+        const int d = (i * kUnroll + j) * veclen_int;
+#pragma unroll
+        for (int k = 0; k < veclen_int; ++k) {
+          q[j][k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          computeDist(dist, q[j][k], encV[j][k]);
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int laneId,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int veclen_int = uint8_veclen / 4;
+    const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int
+    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks;
+         d += uint8_veclen, data += wordsPerVectorBlockDim * uint8_veclen) {
+      uint32_t enc[veclen_int];
+      uint32_t q[veclen_int];
+      ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
+#pragma unroll
+      for (int k = 0; k < veclen_int; k++) {
+        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, kWarpSize);
+        computeDist(dist, q[k], enc[k]);
+      }
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+// Keep this specialized uint8 veclen = 4, because compiler is generating suboptimal code while
+// using above common template of int2/int4
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uint8_t, uint32_t> {
+  computeLambda computeDist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryRegs[kUnroll];
+
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      encV[j]     = reinterpret_cast<unsigned const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
+      const int d = (iShmemIndex * kUnroll + j);
+      queryRegs[j] = reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex)[d];
+      computeDist(dist, queryRegs[j], encV[j]);
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryReg =
+      (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
+    uint32_t q[kUnroll];
+    constexpr int veclen = 4;
+    constexpr int stride = kUnroll * veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        encV[j]     = reinterpret_cast<unsigned const*>(data)[laneId + j * wordsPerVectorBlockDim];
+        const int d = (i * kUnroll + j);
+        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        computeDist(dist, q[j], encV[j]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int laneId,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int veclen = 4;
+    const int loadDim    = dimBlocks + laneId;
+    uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
+      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      computeDist(dist, q, enc);
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uint8_t, uint32_t> {
+  computeLambda computeDist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryRegs[kUnroll];
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      encV[j]     = 0;
+      encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
+      const int d = (iShmemIndex * kUnroll + j);
+      queryRegs[j] = 0;
+      queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
+      computeDist(dist, queryRegs[j], encV[j]);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryReg = 0;
+    queryReg = (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
+    uint32_t q[kUnroll];
+    constexpr int veclen = 2;
+    constexpr int stride = kUnroll * veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        encV[j]     = 0;
+        encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
+        const int d = (i * kUnroll + j);
+        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        computeDist(dist, q[j], encV[j]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int laneId,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int veclen = 2;
+    int loadDim          = dimBlocks + laneId * veclen;
+    uint32_t queryReg    = 0;
+    queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
+      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      computeDist(dist, q, enc);
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uint8_t, uint32_t> {
+  computeLambda computeDist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryRegs[kUnroll];
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
+      const int d  = (iShmemIndex * kUnroll + j);
+      queryRegs[j] = queryShared[baseShmemIndex + d];
+      computeDist(dist, queryRegs[j], encV[j]);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    uint32_t encV[kUnroll];
+    uint32_t queryReg = 0;
+    queryReg          = query[baseLoadIndex + laneId];
+    uint32_t q[kUnroll];
+    constexpr int veclen = 1;
+    constexpr int stride = kUnroll * veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        encV[j]     = 0;
+        encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
+        const int d = (i * kUnroll + j);
+        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        computeDist(dist, q[j], encV[j]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int laneId,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int veclen = 1;
+    int loadDim          = dimBlocks + laneId;
+    uint32_t queryReg    = 0;
+    queryReg             = loadDim < dim ? query[loadDim] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      uint32_t enc = 0;
+      enc          = data[laneId];
+      uint32_t q   = SHFL_SYNC(queryReg, d, kWarpSize);
+      computeDist(dist, q, enc);
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+// This device function is for int8 veclens 4, 8 and 16
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda, int int8_veclen>
+struct loadAndComputeDist<kUnroll,
+                          wordsPerVectorBlockDim,
+                          computeLambda,
+                          int8_veclen,
+                          int8_t,
+                          int32_t> {
+  computeLambda computeDist;
+  int32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
+    int32_t encV[kUnroll][veclen_int];
+    int32_t queryRegs[kUnroll][veclen_int];
+
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      ldg(encV[j],
+          reinterpret_cast<int32_t const*>(data) +
+            (loadIndex + j * wordsPerVectorBlockDim) * veclen_int);
+      const int d = iShmemIndex * kUnroll + j * veclen_int;
+      lds(queryRegs[j], reinterpret_cast<int32_t const*>(queryShared + baseShmemIndex) + d);
+#pragma unroll
+      for (int k = 0; k < veclen_int; k++) {
+        computeDist(dist, queryRegs[j][k], encV[j][k]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
+    int32_t encV[kUnroll][veclen_int];
+    int32_t queryReg =
+      (laneId < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[laneId] : 0;
+    int32_t q[kUnroll][veclen_int];
+    constexpr int stride = kUnroll * int8_veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        ldg(encV[j],
+            reinterpret_cast<int32_t const*>(data) +
+              (laneId + j * wordsPerVectorBlockDim) * veclen_int);
+        const int d = (i * kUnroll + j) * veclen_int;
+#pragma unroll
+        for (int k = 0; k < veclen_int; ++k) {
+          q[j][k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          computeDist(dist, q[j][k], encV[j][k]);
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+  {
+    constexpr int veclen_int = int8_veclen / 4;
+    const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int;
+    int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks;
+         d += int8_veclen, data += wordsPerVectorBlockDim * int8_veclen) {
+      int32_t enc[veclen_int];
+      int32_t q[veclen_int];
+      ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
+#pragma unroll
+      for (int k = 0; k < veclen_int; k++) {
+        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, kWarpSize);  // Here 4 is for 1 - int;
+        computeDist(dist, q[k], enc[k]);
+      }
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int8_t, int32_t> {
+  computeLambda computeDist;
+  int32_t& dist;
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    int32_t encV[kUnroll];
+    int32_t queryRegs[kUnroll];
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      encV[j]     = 0;
+      encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
+      const int d = (iShmemIndex * kUnroll + j);
+      queryRegs[j] = 0;
+      queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
+      computeDist(dist, queryRegs[j], encV[j]);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    int32_t encV[kUnroll];
+    int32_t queryReg = 0;
+    queryReg = (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
+    int32_t q[kUnroll];
+    constexpr int veclen = 2;
+    constexpr int stride = kUnroll * veclen;
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        encV[j]     = 0;
+        encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
+        const int d = (i * kUnroll + j);
+        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        computeDist(dist, q[j], encV[j]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+  {
+    constexpr int veclen = 2;
+    int loadDim          = dimBlocks + laneId * veclen;
+    int32_t queryReg     = 0;
+    queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
+      int32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      computeDist(dist, q, enc);
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int8_t, int32_t> {
+  computeLambda computeDist;
+  int32_t& dist;
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+    : dist(dist), computeDist(op)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* queryShared,
+                                                      int loadIndex,
+                                                      int baseShmemIndex,
+                                                      int iShmemIndex)
+  {
+    int32_t encV[kUnroll];
+    int32_t queryRegs[kUnroll];
+
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      encV[j]      = 0;
+      encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
+      const int d  = (iShmemIndex * kUnroll + j);
+      queryRegs[j] = 0;
+      queryRegs[j] = queryShared[baseShmemIndex + d];
+      computeDist(dist, queryRegs[j], encV[j]);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int laneId)
+  {
+    constexpr int veclen = 1;
+    constexpr int stride = kUnroll * veclen;
+    int32_t encV[kUnroll];
+    int32_t queryReg = 0;
+    queryReg         = query[baseLoadIndex + laneId];
+    int32_t q[kUnroll];
+
+#pragma unroll
+    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        encV[j]     = 0;
+        encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
+        const int d = (i * kUnroll + j);
+        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        computeDist(dist, q[j], encV[j]);
+      }
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+  {
+    constexpr int veclen = 1;
+    const int loadDim    = dimBlocks + laneId;
+    int32_t queryReg     = 0;
+    queryReg             = loadDim < dim ? query[loadDim] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+      int32_t enc = 0;
+      enc         = data[laneId];
+      int32_t q   = SHFL_SYNC(queryReg, d, kWarpSize);
+      computeDist(dist, q, enc);
+    }  // end for d < dim - dimBlocks
+  }
+};
+
+//#define USE_FAISS 1
+
+template <int CAPACITY, int veclen, typename T, typename value_t, typename distLambda, bool GREATER>
+__global__ void interleaved_scan(
+  const T* queries,        // Input: Query Vector; [batch_size, dim]
+  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
+  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
+  const T* list_data,      // Record the full value of vector for each cluster(list) interleaved;
+                           // [nrow, dim]
+  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
+  uint32_t* list_prefix_interleave,           // The start offset of each cluster(list) for
+                                              // list_index; [nlist]
+  const raft::distance::DistanceType metric,  // Function to process the different metric
+  distLambda computeDist,
+  const uint32_t nprobe,
+  const uint32_t k,
+  const uint32_t dim,
+  size_t* neighbors,  // [batch_size, nprobe]
+  float* distances,   // [batch_size, nprobe]
+  const float dummy)
+{
+#ifdef USE_FAISS
+  // temporary use of FAISS blockSelect for development purpose of k <= 32
+  // for comparison purpose
+  __shared__ float smemK[utils::kNumWarps * 32];
+  __shared__ size_t smemV[utils::kNumWarps * 32];
+
+  constexpr auto Dir = GREATER;
+  constexpr auto identity =
+    Dir ? std::numeric_limits<float>::min() : std::numeric_limits<float>::max();
+  constexpr auto keyMax =
+    Dir ? std::numeric_limits<size_t>::min() : std::numeric_limits<size_t>::max();
+
+  faiss::gpu::
+    BlockSelect<float, size_t, Dir, faiss::gpu::Comparator<float>, 32, 2, utils::kThreadPerBlock>
+      queue(identity, keyMax, smemK, smemV, k);
+
+#else
+  extern __shared__ char smem_ext[];
+  constexpr auto Dir =
+    GREATER ? false : true;  // topk::block_sort uses ascending hence switch is needed.
+  topk::block_sort<topk::warp_sort_immediate, CAPACITY, Dir, float, size_t> queue(
+    k, dummy, smem_ext);
+#endif
+
+  const int laneId = threadIdx.x % kWarpSize;
+  const int warpId = threadIdx.x / kWarpSize;
+  int queryId      = blockIdx.y;
+
+  /// Set the address
+  auto query                           = queries + queryId * dim;
+  constexpr int bytesPerVectorBlockDim = sizeof(T) * kWarpSize;
+  constexpr int wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(T);
+
+  // int wordsPerVectorBlock = wordsPerVectorBlockDim * dim;
+  const int dimBlocks = roundDown(dim, kWarpSize);
+
+  // This should be multiple of warpSize = 32
+  constexpr uint32_t queryShmemSize = 2048;
+  __shared__ T queryShared[queryShmemSize];
+
+  int shLoadDim = (dim < queryShmemSize) ? dim : queryShmemSize;
+  shLoadDim     = shLoadDim / veclen;
+
+  for (int loadDim = threadIdx.x; loadDim < shLoadDim; loadDim += blockDim.x) {
+    queryLoadToShmem<T, veclen>(query, queryShared, loadDim);
+  }
+  __syncthreads();
+  shLoadDim = (dim > queryShmemSize) ? (shLoadDim * veclen) : dimBlocks;
+
+  for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
+    uint32_t listId = coarse_index[queryId * nprobe + probeId];  // The id of cluster(list)
+
+    /**
+     * Uses shared memory
+     */
+    //@TODO The result with dimension
+    // The start address of the full value of vector for each cluster(list) interleaved
+    auto vecsBase = list_data + size_t(list_prefix_interleave[listId]) * dim;
+    // The start address of index of vector for each cluster(list) interleaved
+    auto indexBase = list_index + list_prefix_interleave[listId];
+    // The number of vectors in each cluster(list); [nlist]
+    const uint32_t numVecs = list_lengths[listId];
+
+    // The number of interleaved group to be processed
+    const uint32_t numBlocks = divUp(numVecs, kWarpSize);
+
+    for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
+      value_t dist = 0;
+      // This is the vector a given lane/thread handles
+      const uint32_t vec = block * kWarpSize + laneId;
+      bool valid         = vec < numVecs;
+      size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)laneId;
+      // This is where this warp begins reading data
+      const T* data =
+        vecsBase + size_t(block) * wordsPerVectorBlockDim * dim;  // Start position of this block
+
+      if (valid) {
+        /// load query from shared mem
+        for (int dBase = 0; dBase < shLoadDim; dBase += kWarpSize) {  //
+          constexpr int kUnroll   = kWarpSize / veclen;
+          constexpr int stride    = kUnroll * veclen;
+          constexpr int totalIter = kWarpSize / stride;
+
+          loadAndComputeDist<kUnroll,
+                             wordsPerVectorBlockDim,
+                             decltype(computeDist),
+                             veclen,
+                             T,
+                             value_t>
+            obj(dist, computeDist);
+#pragma unroll
+          for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
+            obj.runLoadShmemCompute(data, queryShared, laneId, dBase, i);
+          }  // end for i < kWarpSize / kUnroll
+        }    // end for dBase < dimBlocks
+      }
+
+      if (dim > queryShmemSize) {
+        constexpr int kUnroll = kWarpSize / veclen;
+        ;
+        loadAndComputeDist<kUnroll,
+                           wordsPerVectorBlockDim,
+                           decltype(computeDist),
+                           veclen,
+                           T,
+                           value_t>
+          obj(dist, computeDist);
+        for (int dBase = shLoadDim; dBase < dimBlocks; dBase += kWarpSize) {  //
+          obj.runLoadShflAndCompute(data, query, dBase, laneId);
+        }
+        // Remainder chunk = dim - dimBlocks
+        obj.runLoadShflAndComputeRemainder(data, query, laneId, dim, dimBlocks);
+        // end for d < dim - dimBlocks
+      } else {
+        if (valid) {
+          /// Remainder chunk = dim - dimBlocks
+          for (int d = 0; d < dim - dimBlocks;
+               d += veclen, data += wordsPerVectorBlockDim * veclen) {
+            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), veclen, T, value_t>
+              obj(dist, computeDist);
+            obj.runLoadShmemCompute(data, queryShared, laneId, dimBlocks + d, 0);
+          }  // end for d < dim - dimBlocks
+        }
+      }
+
+      /// Inqueue warp_wise
+      float val = (valid) ? (float)dist : dummy;
+      queue.add(val, idx);
+    }  // end for block < numBlocks
+  }
+
+  /// Warp_wise topk
+#ifdef USE_FAISS
+  queue.reduce();
+  for (int i = threadIdx.x; i < k; i += utils::kThreadPerBlock) {
+    neighbors[queryId * k * gridDim.x + blockIdx.x * k + i] = (size_t)smemV[i];
+    distances[queryId * k * gridDim.x + blockIdx.x * k + i] = smemK[i];
+  }
+#else
+  queue.done();
+  queue.store(distances + queryId * k * gridDim.x + blockIdx.x * k,
+              neighbors + queryId * k * gridDim.x + blockIdx.x * k);
+#endif
+}  // end kernel
+
+template <typename T>
+dim3 launchConfigGenerator(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
+{
+  int devId;
+  RAFT_CUDA_TRY(cudaGetDevice(&devId));
+  int numSMs;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, devId));
+  int numBlocksPerSm = 0;
+  dim3 grid;
+  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &numBlocksPerSm, func, utils::kThreadPerBlock, sMemSize));
+
+  std::size_t minGridSize = numSMs * numBlocksPerSm;
+  std::size_t yChunks     = numQueries;
+  std::size_t xChunks     = nprobe;
+  // grid.y                  = yChunks > minGridSize ? minGridSize : yChunks;
+  grid.y = yChunks;
+  grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
+  if (grid.x != 1) {
+    std::size_t i = 1;
+    while (grid.y * i < minGridSize) {
+      i++;
+    }
+    grid.x = i >= xChunks ? xChunks : i;
+  }
+
+  return grid;
+}
+
+template <int capacity, int veclen, typename T, typename acc_type>
+void launch_interleaved_scan_kernel(
+  const T* queries,        // Input: Query Vector; [batch_size, dim]
+  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
+  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
+  void* list_data,         // Record the full value of vector for each cluster(list) interleaved;
+                           // [nrow, dim]
+  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
+  uint32_t* list_prefix_interleave,     // The start offset of each cluster(list) for
+                                        // list_index; [nlist]
+  raft::distance::DistanceType metric,  // Function to process the different metric
+  const uint32_t nprobe,
+  const uint32_t k,
+  const uint32_t dim,
+  size_t* neighbors,  // [batch_size, nprobe]
+  float* distances,   // [batch_size, nprobe]
+  const bool greater,
+  const int smem_size,
+  const uint32_t batch_size,
+  cudaStream_t stream,
+  uint32_t& gridDimX)
+{
+  const float dummy = utils::numeric::get_dummy<float>(greater);  // should be value_t?
+
+  // Accumulation inner product lambda
+  auto inner_prod_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
+    if constexpr ((std::is_same<T, int8_t>{}) || (std::is_same<T, uint8_t>{})) {
+      if constexpr (veclen == 1) {
+        acc += x * y;
+      } else {
+        acc = __dp4a(x, y, acc);
+      }
+    } else if constexpr (std::is_same<T, float>{}) {
+      acc += x * y;
+    }
+  };
+
+  // Accumulation euclidean L2 lambda
+  auto euclidean_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
+    if constexpr ((std::is_same<T, uint8_t>{})) {
+      if constexpr (veclen == 1) {
+        const acc_type diff = x - y;
+        acc += diff * diff;
+      } else {
+        const acc_type diff = __vabsdiffu4(x, y);
+        // TODO: add CUDA_ARCH based guard as IDP is only available from SM 6.1 onwards
+        acc = __dp4a(diff, diff, acc);
+      }
+    } else if constexpr (std::is_same<T, int8_t>{}) {
+      if constexpr (veclen == 1) {
+        const acc_type diff = x - y;
+        acc += diff * diff;
+      } else {
+        asm("vabsdiff4.u32.s32.s32 %0,%1,%2,%3;" : "=r"(x) : "r"(x), "r"(y), "r"(0));
+        acc = __dp4a(x, x, acc);
+      }
+    } else if constexpr ((std::is_same<T, float>{})) {
+      const acc_type diff = x - y;
+      acc += diff * diff;
+    }
+  };
+
+  dim3 block_dim(utils::kThreadPerBlock);
+
+  if (greater) {
+    if (metric == raft::distance::DistanceType::L2Expanded ||
+        metric == raft::distance::DistanceType::L2Unexpanded) {
+      constexpr auto interleaved_scan_euclidean_greater =
+        interleaved_scan<capacity, veclen, T, acc_type, decltype(euclidean_lambda), true>;
+      if (gridDimX == 0) {
+        dim3 grid_dim =
+          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_greater);
+        gridDimX = grid_dim.x;
+        return;
+      }
+      dim3 grid_dim =
+        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_greater);
+      interleaved_scan_euclidean_greater<<<grid_dim, block_dim, smem_size, stream>>>(
+        queries,
+        coarse_index,
+        list_index,
+        (T*)list_data,
+        list_lengths,
+        list_prefix_interleave,
+        metric,
+        euclidean_lambda,
+        nprobe,
+        k,
+        dim,
+        neighbors,
+        distances,
+        dummy);
+    } else {
+      constexpr auto interleaved_scan_inner_prod_greater =
+        interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), true>;
+      if (gridDimX == 0) {
+        dim3 grid_dim =
+          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_greater);
+        gridDimX = grid_dim.x;
+        return;
+      }
+      dim3 grid_dim =
+        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_greater);
+      interleaved_scan_inner_prod_greater<<<grid_dim, block_dim, smem_size, stream>>>(
+        queries,
+        coarse_index,
+        list_index,
+        (T*)list_data,
+        list_lengths,
+        list_prefix_interleave,
+        metric,
+        inner_prod_lambda,
+        nprobe,
+        k,
+        dim,
+        neighbors,
+        distances,
+        dummy);
+    }
+  } else {
+    if (metric == raft::distance::DistanceType::L2Expanded ||
+        metric == raft::distance::DistanceType::L2Unexpanded) {
+      constexpr auto interleaved_scan_euclidean_ngreater =
+        interleaved_scan<capacity, veclen, T, acc_type, decltype(euclidean_lambda), false>;
+      if (gridDimX == 0) {
+        dim3 grid_dim =
+          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_ngreater);
+        gridDimX = grid_dim.x;
+        return;
+      }
+      dim3 grid_dim =
+        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_ngreater);
+      interleaved_scan_euclidean_ngreater<<<grid_dim, block_dim, smem_size, stream>>>(
+        queries,
+        coarse_index,
+        list_index,
+        (T*)list_data,
+        list_lengths,
+        list_prefix_interleave,
+        metric,
+        euclidean_lambda,
+        nprobe,
+        k,
+        dim,
+        neighbors,
+        distances,
+        dummy);
+    } else {
+      constexpr auto interleaved_scan_inner_prod_ngreater =
+        interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), false>;
+      if (gridDimX == 0) {
+        dim3 grid_dim = launchConfigGenerator(
+          batch_size, nprobe, smem_size, interleaved_scan_inner_prod_ngreater);
+        gridDimX = grid_dim.x;
+        return;
+      }
+      dim3 grid_dim =
+        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_ngreater);
+      interleaved_scan_inner_prod_ngreater<<<grid_dim, block_dim, smem_size, stream>>>(
+        queries,
+        coarse_index,
+        list_index,
+        (T*)list_data,
+        list_lengths,
+        list_prefix_interleave,
+        metric,
+        inner_prod_lambda,
+        nprobe,
+        k,
+        dim,
+        neighbors,
+        distances,
+        dummy);
+    }
+  }
+}
+
+template <int capacity, typename T, typename acc_type>
+void select_interleaved_scan_kernel(
+  const T* queries,        // Input: Query Vector; [batch_size, dim]
+  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
+  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
+  void* list_data,         // Record the full value of vector for each cluster(list) interleaved;
+                           // [nrow, dim]
+  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
+  uint32_t* list_prefix_interleave,           // The start offset of each cluster(list) for
+                                              // list_index; [nlist]
+  const raft::distance::DistanceType metric,  // Function to process the different metric
+  const uint32_t nprobe,
+  const uint32_t k,
+  const uint32_t dim,
+  size_t* neighbors,  // [batch_size, nprobe]
+  float* distances,   // [batch_size, nprobe]
+  const bool greater,
+  const int smem_size,
+  const uint32_t batch_size,
+  cudaStream_t stream,
+  const int veclen,
+  uint32_t& gridDimX)
+{
+  if constexpr ((std::is_same<T, uint8_t>{}) || (std::is_same<T, int8_t>{})) {
+    switch (veclen) {
+      case 1:
+        launch_interleaved_scan_kernel<capacity, 1, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 2:
+        launch_interleaved_scan_kernel<capacity, 2, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 4:
+        launch_interleaved_scan_kernel<capacity, 4, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 8:
+        launch_interleaved_scan_kernel<capacity, 8, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 16:
+        launch_interleaved_scan_kernel<capacity, 16, T, acc_type>(queries,
+                                                                  coarse_index,
+                                                                  list_index,
+                                                                  list_data,
+                                                                  list_lengths,
+                                                                  list_prefix_interleave,
+                                                                  metric,
+                                                                  nprobe,
+                                                                  k,
+                                                                  dim,
+                                                                  neighbors,
+                                                                  distances,
+                                                                  greater,
+                                                                  smem_size,
+                                                                  batch_size,
+                                                                  stream,
+                                                                  gridDimX);
+        break;
+      default: assert("veclen should be 1, 2, 4, 8 or 16\n"); break;
+    }
+  } else if constexpr (std::is_same<T, float>{}) {
+    switch (veclen) {
+      case 1:
+        launch_interleaved_scan_kernel<capacity, 1, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 2:
+        launch_interleaved_scan_kernel<capacity, 2, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      case 4:
+        launch_interleaved_scan_kernel<capacity, 4, T, acc_type>(queries,
+                                                                 coarse_index,
+                                                                 list_index,
+                                                                 list_data,
+                                                                 list_lengths,
+                                                                 list_prefix_interleave,
+                                                                 metric,
+                                                                 nprobe,
+                                                                 k,
+                                                                 dim,
+                                                                 neighbors,
+                                                                 distances,
+                                                                 greater,
+                                                                 smem_size,
+                                                                 batch_size,
+                                                                 stream,
+                                                                 gridDimX);
+        break;
+      default: assert("veclen should be 1, 2 or 4\n"); break;
+    }
+  }
+}
+
+template <typename T, typename value_t>
+void ivfflat_interleaved_scan(const T* queries,                  //[batch_size, dim]
+                              uint32_t* coarse_index,            //[batch_size,nprobe]
+                              uint32_t* list_index,              // [nrow]
+                              void* list_data,                   //[nrow, dim]
+                              uint32_t* list_lengths,            // [nlist]
+                              uint32_t* list_prefix_interleave,  // [nlist]
+                              const raft::distance::DistanceType metric,
+                              const uint32_t nprobe,
+                              const uint32_t k,
+                              const uint32_t batch_size,
+                              const uint32_t dim,
+                              size_t* neighbors,  // [batch_size, nprobe, k]
+                              float* distances,   // [batch_size, nprobe, k]
+                              cudaStream_t stream,
+                              const bool greater,
+                              const int veclen,
+                              uint32_t& gridDimX)
+{
+  const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
+
+#ifdef USE_FAISS
+  int smem_size = 0;
+#else
+  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<value_t, size_t>(
+    utils::kNumWarps, k);
+#endif
+
+  switch (capacity) {
+    case 32:
+      select_interleaved_scan_kernel<32, T, value_t>(queries,
+                                                     coarse_index,
+                                                     list_index,
+                                                     list_data,
+                                                     list_lengths,
+                                                     list_prefix_interleave,
+                                                     metric,
+                                                     nprobe,
+                                                     k,
+                                                     dim,
+                                                     neighbors,
+                                                     distances,
+                                                     greater,
+                                                     smem_size,
+                                                     batch_size,
+                                                     stream,
+                                                     veclen,
+                                                     gridDimX);
+      break;
+    // case 64:
+    //   select_interleaved_scan_kernel<64, T, value_t>(queries, coarse_index,
+    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
+    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
+    //   stream, veclen, gridDimX);
+    //   break;
+    // case 128:
+    //   select_interleaved_scan_kernel<128, T, value_t>(queries, coarse_index,
+    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
+    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
+    //   stream, veclen, gridDimX);
+    //   break;
+    // case 256:
+    //   select_interleaved_scan_kernel<256, T, value_t>(queries, coarse_index,
+    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
+    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
+    //   stream, veclen, gridDimX);
+    //   break;
+    // case 512:
+    //   select_interleaved_scan_kernel<512, T, value_t>(queries, coarse_index,
+    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
+    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
+    //   stream, veclen, gridDimX);
+    //   break;
+    // case 1024:
+    //   select_interleaved_scan_kernel<1024, T, value_t>(queries, coarse_index,
+    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
+    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
+    //   stream, veclen, gridDimX);
+    //   break;
+    default: break;
+  }  // end switch
+}
+
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
new file mode 100644
index 0000000000..d5b95af99c
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../ann_common.h"
+#include "ann_utils.cuh"
+#include "knn_brute_force_faiss.cuh"
+
+#include "common_faiss.h"
+#include "processing.hpp"
+
+#include "processing.hpp"
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+//#include <label/classlabels.cuh>
+#include <raft/distance/distance.hpp>
+#include <raft/spatial/knn/faiss_mr.hpp>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/utils/Heap.h>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <raft/distance/distance_type.hpp>
+
+#include <iostream>
+#include <set>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+// namespace kmeans {
+
+// predict label of dataset
+void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle,
+                                const float* centers,  // [numCenters, dimCenters]
+                                uint32_t numCenters,
+                                uint32_t dimCenters,
+                                const float* dataset,  // [numDataset, dimCenters]
+                                uint32_t numDataset,
+                                uint32_t* labels,  // [numDataset]
+                                raft::distance::DistanceType metric,
+                                float* workspace)
+{
+  const uint32_t dimDataset = dimCenters;
+  float* sqsumCenters;  // [numCenters]
+  float* sqsumDataset;  // [numDataset]
+  float* distances;     // [numDataset, numCenters]
+
+  sqsumCenters = workspace;
+  sqsumDataset = sqsumCenters + numCenters;
+  distances    = sqsumDataset + numDataset;
+
+  float alpha;
+  float beta;
+  if (metric == raft::distance::DistanceType::InnerProduct) {
+    alpha = -1.0;
+    beta  = 0.0;
+  } else {
+    utils::_cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters);
+    utils::_cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset);
+    utils::_cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances);
+    alpha = -2.0;
+    beta  = 1.0;
+  }
+  cublasGemmEx(cublasHandle,
+               CUBLAS_OP_T,
+               CUBLAS_OP_N,
+               numCenters,
+               numDataset,
+               dimCenters,
+               &alpha,
+               centers,
+               CUDA_R_32F,
+               dimCenters,
+               dataset,
+               CUDA_R_32F,
+               dimDataset,
+               &beta,
+               distances,
+               CUDA_R_32F,
+               numCenters,
+               CUBLAS_COMPUTE_32F,
+               CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+  utils::_cuann_argmin(numDataset, numCenters, distances, labels);
+}
+
+//
+uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset)
+{
+  uint32_t chunk = (1 << 20);
+  if (chunk > (1 << 28) / numCenters) {
+    chunk = (1 << 28) / numCenters;
+    chunk += 32;
+    chunk -= chunk % 64;
+  }
+  chunk = min(chunk, numDataset);
+  return chunk;
+}
+
+//
+size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters,
+                                        uint32_t dimCenters,
+                                        uint32_t numDataset)
+{
+  uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
+  size_t size    = 0;
+  // float *curDataset;  // [chunk, dimCenters]
+  size += utils::_cuann_aligned(sizeof(float) * chunk * dimCenters);
+  // void *bufDataset;  // [chunk, dimCenters]
+  size += utils::_cuann_aligned(sizeof(float) * chunk * dimCenters);
+  // float *workspace;
+  size += utils::_cuann_aligned(sizeof(float) * (numCenters + chunk + (numCenters * chunk)));
+  return size;
+}
+
+// update kmeans centers
+void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
+                                  uint32_t numCenters,
+                                  uint32_t dimCenters,
+                                  const void* dataset,  // [numDataset, dimCenters]
+                                  cudaDataType_t dtype,
+                                  uint32_t numDataset,
+                                  uint32_t* labels,  // [numDataset]
+                                  raft::distance::DistanceType metric,
+                                  uint32_t* clusterSize,  // [numCenters]
+                                  float* accumulatedCenters = NULL)
+{
+  if (accumulatedCenters == NULL) {
+    // accumulate
+    utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters);
+    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
+    if (dtype == CUDA_R_32F) {
+      utils::_cuann_accumulate_with_label<float>(
+        numCenters, dimCenters, centers, clusterSize, numDataset, (const float*)dataset, labels);
+    } else if (dtype == CUDA_R_8U) {
+      constexpr float divisor = 256.0;
+      utils::_cuann_accumulate_with_label<uint8_t>(numCenters,
+                                                   dimCenters,
+                                                   centers,
+                                                   clusterSize,
+                                                   numDataset,
+                                                   (const uint8_t*)dataset,
+                                                   labels,
+                                                   divisor);
+    } else if (dtype == CUDA_R_8I) {
+      constexpr float divisor = 128.0;
+      utils::_cuann_accumulate_with_label<int8_t>(numCenters,
+                                                  dimCenters,
+                                                  centers,
+                                                  clusterSize,
+                                                  numDataset,
+                                                  (const int8_t*)dataset,
+                                                  labels,
+                                                  divisor);
+    }
+  } else {
+    cudaMemcpy(
+      centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault);
+  }
+
+  if (metric == raft::distance::DistanceType::InnerProduct) {
+    // normalize
+    utils::_cuann_normalize(numCenters, dimCenters, centers, clusterSize);
+  } else {
+    // average
+    utils::_cuann_divide(numCenters, dimCenters, centers, clusterSize);
+  }
+}
+
+// predict label of dataset
+void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
+                           float* centers,  // [numCenters, dimCenters]
+                           uint32_t numCenters,
+                           uint32_t dimCenters,
+                           const void* dataset,  // [numDataset, dimCenters]
+                           cudaDataType_t dtype,
+                           uint32_t numDataset,
+                           uint32_t* labels,  // [numDataset]
+                           raft::distance::DistanceType metric,
+                           bool isCenterSet      = true,
+                           void* _workspace      = NULL,
+                           float* tempCenters    = NULL,  // [numCenters, dimCenters]
+                           uint32_t* clusterSize = NULL,  // [numCenters,]
+                           bool updateCenter     = true)
+{
+  if (!isCenterSet) {
+    // If centers are not set, the labels will be determined randomly.
+    for (uint32_t i = 0; i < numDataset; i++) {
+      labels[i] = i % numCenters;
+    }
+    if (tempCenters != NULL && clusterSize != NULL) {
+      // update centers
+      _cuann_kmeans_update_centers(
+        centers, numCenters, dimCenters, dataset, dtype, numDataset, labels, metric, clusterSize);
+    }
+    return;
+  }
+
+  uint32_t chunk  = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
+  void* workspace = _workspace;
+  if (_workspace == NULL) {
+    size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset);
+    RAFT_CUDA_TRY(cudaMallocManaged(&workspace, sizeWorkspace));
+  }
+  float* curDataset;  // [chunk, dimCenters]
+  void* bufDataset;   // [chunk, dimCenters]
+  float* workspace_core;
+  curDataset = (float*)workspace;
+  bufDataset =
+    (void*)((uint8_t*)curDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
+  workspace_core =
+    (float*)((uint8_t*)bufDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
+
+  if (tempCenters != NULL && clusterSize != NULL) {
+    utils::_cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters);
+    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
+  }
+
+  cudaMemcpyKind kind;
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, dataset);
+  if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
+    kind = cudaMemcpyDeviceToDevice;
+  } else {
+    kind = cudaMemcpyHostToDevice;
+  }
+
+  for (uint64_t is = 0; is < numDataset; is += chunk) {
+    uint64_t ie       = min(is + chunk, (uint64_t)numDataset);
+    uint32_t nDataset = ie - is;
+
+    if (dtype == CUDA_R_32F) {
+      RAFT_CUDA_TRY(cudaMemcpy(bufDataset,
+                               (float*)dataset + (is * dimCenters),
+                               sizeof(float) * nDataset * dimCenters,
+                               cudaMemcpyDefault));
+    } else if (dtype == CUDA_R_8U) {
+      RAFT_CUDA_TRY(cudaMemcpyAsync(bufDataset,
+                                    (uint8_t*)dataset + (is * dimCenters),
+                                    sizeof(uint8_t) * nDataset * dimCenters,
+                                    kind,
+                                    NULL));
+    } else if (dtype == CUDA_R_8I) {
+      RAFT_CUDA_TRY(cudaMemcpyAsync(bufDataset,
+                                    (int8_t*)dataset + (is * dimCenters),
+                                    sizeof(int8_t) * nDataset * dimCenters,
+                                    kind,
+                                    NULL));
+    }
+
+    if (dtype == CUDA_R_32F) {
+      // No need to copy when dtype is CUDA_R_32F
+      curDataset = (float*)bufDataset;
+    } else if (dtype == CUDA_R_8U) {
+      float divisor = 256.0;
+      utils::_cuann_copy<uint8_t, float>(nDataset,
+                                         dimCenters,
+                                         (const uint8_t*)bufDataset,
+                                         dimCenters,
+                                         curDataset,
+                                         dimCenters,
+                                         divisor);
+    } else if (dtype == CUDA_R_8I) {
+      float divisor = 128.0;
+      utils::_cuann_copy<int8_t, float>(nDataset,
+                                        dimCenters,
+                                        (const int8_t*)bufDataset,
+                                        dimCenters,
+                                        curDataset,
+                                        dimCenters,
+                                        divisor);
+    }
+
+    // predict
+    _cuann_kmeans_predict_core(cublasHandle,
+                               centers,
+                               numCenters,
+                               dimCenters,
+                               curDataset,
+                               nDataset,
+                               labels + is,
+                               metric,
+                               workspace_core);
+
+    if ((tempCenters != NULL) && (clusterSize != NULL)) {
+      // accumulate
+      utils::_cuann_accumulate_with_label<float>(
+        numCenters, dimCenters, tempCenters, clusterSize, nDataset, curDataset, labels + is);
+    }
+  }
+
+  if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) {
+    _cuann_kmeans_update_centers(centers,
+                                 numCenters,
+                                 dimCenters,
+                                 dataset,
+                                 dtype,
+                                 numDataset,
+                                 labels,
+                                 metric,
+                                 clusterSize,
+                                 tempCenters);
+  }
+
+  if (_workspace == NULL) { cudaFree(workspace); }
+}
+
+// adjust centers which have small number of entries
+bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
+                                  uint32_t numCenters,
+                                  uint32_t dimCenters,
+                                  const void* dataset,  // [numDataset, dimCenters]
+                                  cudaDataType_t dtype,
+                                  uint32_t numDataset,
+                                  const uint32_t* labels,  // [numDataset]
+                                  raft::distance::DistanceType metric,
+                                  const uint32_t* clusterSize,  // [numCenters]
+                                  float threshold)
+{
+  // cudaDeviceSynchronize();
+  bool adjusted                = false;
+  static uint32_t i            = 0;
+  static uint32_t iPrimes      = 0;
+  constexpr uint32_t numPrimes = 40;
+  uint32_t primes[numPrimes]   = {29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
+                                601,  659,  733,  809,  863,  941,  1013, 1069, 1151, 1223,
+                                1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987,
+                                2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
+  uint32_t average             = numDataset / numCenters;
+  uint32_t ofst;
+  if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
+    fprintf(stderr, "(%s, %d) Unsupported dtype (%d)\n", __func__, __LINE__, dtype);
+  }
+  do {
+    iPrimes = (iPrimes + 1) % numPrimes;
+    ofst    = primes[iPrimes];
+  } while (numDataset % ofst == 0);
+  uint32_t count = 0;
+
+  for (uint32_t l = 0; l < numCenters; l++) {
+    if (clusterSize[l] > (uint32_t)(average * threshold)) continue;
+    do {
+      i = (i + ofst) % numDataset;
+    } while (clusterSize[labels[i]] < average);
+    uint32_t li = labels[i];
+    float sqsum = 0.0;
+    for (uint32_t j = 0; j < dimCenters; j++) {
+      float val = centers[j + ((uint64_t)dimCenters * li)] * 7.0;
+      if (dtype == CUDA_R_32F) {
+        val += ((float*)dataset)[j + ((uint64_t)dimCenters * i)];
+      } else if (dtype == CUDA_R_8U) {
+        float divisor = 256.0;
+        val += ((uint8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor;
+      } else if (dtype == CUDA_R_8I) {
+        float divisor = 128.0;
+        val += ((int8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor;
+      }
+      val /= 8.0;
+      sqsum += val * val;
+      centers[j + ((uint64_t)dimCenters * l)] = val;
+    }
+    if (metric == raft::distance::DistanceType::InnerProduct) {
+      sqsum = sqrt(sqsum);
+      for (uint32_t j = 0; j < dimCenters; j++) {
+        centers[j + ((uint64_t)dimCenters * l)] /= sqsum;
+      }
+    }
+    adjusted = true;
+    count += 1;
+  }
+
+#ifdef CUANN_DEBUG
+  if (count > 0) {
+    fprintf(stderr,
+            "(%s) num adjusted: %u / %u, threshold: %d\n",
+            __func__,
+            count,
+            numCenters,
+            (int)(average * threshold));
+  }
+#endif
+  return adjusted;
+}
+
+//}  // namespace kmeans
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 78631b431f..c9d2313baa 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -26,8 +26,8 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 
-#include <label/classlabels.cuh>
 #include <raft/distance/distance.cuh>
+#include <raft/label/classlabels.cuh>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
@@ -45,9 +45,13 @@
 
 #include <raft/distance/distance_type.hpp>
 
+#include "ann_ivf_flat.cuh"
+
 #include <iostream>
 #include <set>
 
+#define IVF_FAISS 0
+
 namespace raft {
 namespace spatial {
 namespace knn {
@@ -82,6 +86,62 @@ void approx_knn_ivfflat_build_index(
   index->index = faiss_index;
 }
 
+template <typename T = float, typename IntType = int>
+void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
+                                           IVFParam* params,
+                                           raft::distance::DistanceType metric,
+                                           T* dataset,
+                                           IntType n,
+                                           IntType D,
+                                           cudaStream_t stream)
+{
+  int ratio = 2;  // TODO: take these parameters from API
+  int niter = 20;
+
+  index->handle_ = std::make_unique<cuivflHandle>(metric, D, params->nlist, niter, index->device);
+
+  const int dim       = D;
+  const size_t ntrain = n / ratio;
+  assert(ntrain > 0);
+
+  // T* trainset = (T*)rmm(ntrain * dim * sizeof(T));
+  // cudaMemcpyKind kind;
+  // cudaPointerAttributes attr;
+  // cudaPointerGetAttributes(&attr, dataset);
+  // if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
+  //     kind = cudaMemcpyDeviceToDevice;
+  // } else {
+  //     kind = cudaMemcpyHostToDevice;
+  // }
+
+  // rmm::device_uvector<T> trainset(ntrain * dim, stream);
+  T* trainset = nullptr;
+  RAFT_CUDA_TRY(cudaMallocManaged(&trainset, ntrain * dim * sizeof(T)));
+  printf("  ntrain = %d and n = %d dim = %d nlist = %d nprobe = %d\n",
+         (int)ntrain,
+         (int)n,
+         (int)dim,
+         (int)params->nlist,
+         (int)params->nprobe);
+  fflush(0);
+
+  for (size_t i = 0; i < ntrain; ++i) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(
+      trainset + i * dim, dataset + ratio * i * dim, dim * sizeof(T), cudaMemcpyDefault, stream));
+  }
+
+  cudaDataType_t dtype;
+  if (typeid(T) == typeid(float)) {
+    dtype = CUDA_R_32F;
+  } else if (typeid(T) == typeid(uint8_t)) {
+    dtype = CUDA_R_8U;
+  } else if (typeid(T) == typeid(int8_t)) {
+    dtype = CUDA_R_8I;
+  }
+  index->handle_->cuivflBuildIndex(dataset, trainset, dtype, n, ntrain, stream);
+  RAFT_CUDA_TRY(cudaFree(trainset));
+}
+
 template <typename IntType = int>
 void approx_knn_ivfpq_build_index(
   knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
@@ -111,97 +171,183 @@ void approx_knn_ivfsq_build_index(
   index->index = faiss_index;
 }
 
-template <typename IntType = int>
+template <typename T = float, typename IntType = int>
 void approx_knn_build_index(raft::handle_t& handle,
                             raft::spatial::knn::knnIndex* index,
                             raft::spatial::knn::knnIndexParam* params,
                             raft::distance::DistanceType metric,
                             float metricArg,
-                            float* index_array,
+                            T* index_array,
                             IntType n,
                             IntType D)
 {
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
-  gpu_res->noTempMemory();
-  gpu_res->setDefaultStream(device, handle.get_stream());
-  index->gpu_res   = gpu_res;
-  index->device    = device;
   index->index     = nullptr;
   index->metric    = metric;
   index->metricArg = metricArg;
 
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
-  std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(metric, n, D, 0, false, handle.get_stream());
-
-  query_metric_processor->preprocess(index_array);
-
-  if (dynamic_cast<IVFFlatParam*>(params)) {
-    IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
-    approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
-    std::vector<float> h_index_array(n * D);
-    raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
-    query_metric_processor->revert(index_array);
-    index->index->train(n, h_index_array.data());
-    index->index->add(n, h_index_array.data());
-  } else {
-    if (dynamic_cast<IVFPQParam*>(params)) {
-      IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
-      approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
-    } else if (dynamic_cast<IVFSQParam*>(params)) {
-      IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
-      approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
-    } else {
-      ASSERT(index->index, "KNN index could not be initialized");
+  if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
+    if (dynamic_cast<IVFFlatParam*>(params)) {
+      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+      T* h_index_array;
+      RAFT_CUDA_TRY(cudaMallocManaged(&h_index_array, n * D * sizeof(T)));
+      RAFT_CUDA_TRY(cudaMemcpyAsync(
+        h_index_array, index_array, n * D * sizeof(T), cudaMemcpyDefault, handle.get_stream()));
+
+      approx_knn_cuivfl_ivfflat_build_index(
+        index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
     }
+  } else if constexpr (std::is_same<T, float>{}) {
+    std::unique_ptr<MetricProcessor<float>> query_metric_processor =
+      create_processor<float>(metric, n, D, 0, false, handle.get_stream());
+
+    if (dynamic_cast<IVFFlatParam*>(params)) {
+      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+      // cuivfl only supports L2/Inner product for now.
+      if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
+          metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+          metric == raft::distance::DistanceType::L2Unexpanded ||
+          metric == raft::distance::DistanceType::L2Expanded ||
+          metric == raft::distance::DistanceType::InnerProduct) {
+        T* h_index_array;
+        cudaMallocManaged(&h_index_array, n * D * sizeof(T));
+        // raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
+        // handle.get_stream());
+        cudaMemcpy(h_index_array, index_array, n * D * sizeof(T), cudaMemcpyDefault);
+
+        approx_knn_cuivfl_ivfflat_build_index(
+          index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
+      } else {
+        int device;
+        RAFT_CUDA_TRY(cudaGetDevice(&device));
+        raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
+        gpu_res->noTempMemory();
+        gpu_res->setDefaultStream(device, handle.get_stream());
+        index->gpu_res = gpu_res;
+        index->device  = device;
+        approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
+        std::vector<float> h_index_array(n * D);
+        raft::update_host(
+          h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
+        query_metric_processor->revert(index_array);
+        index->index->train(n, h_index_array.data());
+        index->index->add(n, h_index_array.data());
+      }
+    } else {
+      int device;
+      RAFT_CUDA_TRY(cudaGetDevice(&device));
+      raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
+      gpu_res->noTempMemory();
+      gpu_res->setDefaultStream(device, handle.get_stream());
+      index->gpu_res = gpu_res;
+      index->device  = device;
+      query_metric_processor->preprocess(index_array);
+      if (dynamic_cast<IVFPQParam*>(params)) {
+        IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
+        approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
+      } else if (dynamic_cast<IVFSQParam*>(params)) {
+        IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
+        approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
+      } else {
+        ASSERT(index->index, "KNN index could not be initialized");
+      }
 
-    index->index->train(n, index_array);
-    index->index->add(n, index_array);
-    query_metric_processor->revert(index_array);
+      index->index->train(n, index_array);
+      index->index->add(n, index_array);
+      query_metric_processor->revert(index_array);
+    }
   }
 }
 
-template <typename IntType = int>
+template <typename T = float, typename IntType = int>
 void approx_knn_search(raft::handle_t& handle,
                        float* distances,
                        int64_t* indices,
                        raft::spatial::knn::knnIndex* index,
+                       raft::spatial::knn::knnIndexParam* params,
                        IntType k,
-                       float* query_array,
+                       T* query_array,
                        IntType n)
 {
   // perform preprocessing
+#if 0
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(index->metric, n, index->index->d, k, false, handle.get_stream());
-
+  create_processor<float>(index->metric, n, index->index->d, k, false, handle.get_stream());
   query_metric_processor->preprocess(query_array);
-  index->index->search(n, query_array, k, distances, indices);
-  query_metric_processor->revert(query_array);
-
-  // Perform necessary post-processing
-  if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
-      index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
-      index->metric == raft::distance::DistanceType::LpUnexpanded) {
-    /**
-     * post-processing
-     */
-    float p = 0.5;  // standard l2
-    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
-    raft::linalg::unaryOp<float>(
-      distances,
-      distances,
-      n * k,
-      [p] __device__(float input) { return powf(input, p); },
-      handle.get_stream());
+    index->index->search(n, query_array, k, distances, indices);
+#else
+  if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
+    if (dynamic_cast<IVFFlatParam*>(params)) {
+      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+      int nprobe                  = IVFFlat_param->nprobe;
+      int max_batch               = n;
+      int max_k                   = k;
+      // assert(nprobe <= nlist_); ?? is it supposed to compare agains the private member of
+      // cuivflHandle?
+
+      index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+
+      cudaDataType_t dtype;
+      if (typeid(T) == typeid(float)) {
+        dtype = CUDA_R_32F;
+      } else if (typeid(T) == typeid(uint8_t)) {
+        dtype = CUDA_R_8U;
+      } else if (typeid(T) == typeid(int8_t)) {
+        dtype = CUDA_R_8I;
+      }
+      index->handle_->cuivflSearch(
+        query_array, max_batch, max_k, (size_t*)indices, distances, handle.get_stream(), dtype);
+    }
+  } else if constexpr (std::is_same<T, float>{}) {
+    std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
+      index->metric, n, index->handle_->getDim(), k, false, handle.get_stream());
+    query_metric_processor->preprocess(query_array);
+    if (dynamic_cast<IVFFlatParam*>(params)) {
+      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+      int nprobe                  = IVFFlat_param->nprobe;
+      int max_batch               = n;
+      int max_k                   = k;
+      // assert(nprobe <= nlist_); ?? is it supposed to compare agains the private member of
+      // cuivflHandle?
+
+      index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+
+      cudaDataType_t dtype;
+      if (typeid(T) == typeid(float)) {
+        dtype = CUDA_R_32F;
+      } else if (typeid(T) == typeid(uint8_t)) {
+        dtype = CUDA_R_8U;
+      } else if (typeid(T) == typeid(int8_t)) {
+        dtype = CUDA_R_8I;
+      }
+      index->handle_->cuivflSearch(
+        query_array, max_batch, max_k, (size_t*)indices, distances, handle.get_stream(), dtype);
+    }
+    query_metric_processor->revert(query_array);
+
+    // Perform necessary post-processing
+    if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
+        index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+        index->metric == raft::distance::DistanceType::LpUnexpanded) {
+      /**
+       * post-processing
+       */
+      float p = 0.5;  // standard l2
+      if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
+      raft::linalg::unaryOp<float>(
+        distances,
+        distances,
+        n * k,
+        [p] __device__(float input) { return powf(input, p); },
+        handle.get_stream());
+    }
+    query_metric_processor->postprocess(distances);
   }
-  query_metric_processor->postprocess(distances);
+#endif
 }
 
 }  // namespace detail
 }  // namespace knn
 }  // namespace spatial
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
new file mode 100644
index 0000000000..aa75f8d002
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../ann_common.h"
+#include "knn_brute_force_faiss.cuh"
+
+#include "common_faiss.h"
+#include "processing.hpp"
+
+#include "processing.hpp"
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+#include <raft/distance/distance.hpp>
+#include <raft/spatial/knn/faiss_mr.hpp>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/utils/Heap.h>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <raft/distance/distance_type.hpp>
+
+#include <iostream>
+#include <set>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+namespace utils {
+
+// bool check(cudaError_t e, int iLine, const char *szFile) {
+//         if (e != cudaSuccess) {
+//             std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine
+//                     << " in file " << szFile << std::endl;
+//             exit(0);
+//             return false;
+//         }
+//         return true;
+// }
+
+// const char *cublasGetErrorString(cublasStatus_t status) {
+//     switch (status) {
+//     case CUBLAS_STATUS_SUCCESS:
+//         return "CUBLAS_STATUS_SUCCESS";
+//     case CUBLAS_STATUS_NOT_INITIALIZED:
+//         return "CUBLAS_STATUS_NOT_INITIALIZED";
+//     case CUBLAS_STATUS_ALLOC_FAILED:
+//         return "CUBLAS_STATUS_ALLOC_FAILED";
+//     case CUBLAS_STATUS_INVALID_VALUE:
+//         return "CUBLAS_STATUS_INVALID_VALUE";
+//     case CUBLAS_STATUS_ARCH_MISMATCH:
+//         return "CUBLAS_STATUS_ARCH_MISMATCH";
+//     case CUBLAS_STATUS_MAPPING_ERROR:
+//         return "CUBLAS_STATUS_MAPPING_ERROR";
+//     case CUBLAS_STATUS_EXECUTION_FAILED:
+//         return "CUBLAS_STATUS_EXECUTION_FAILED";
+//     case CUBLAS_STATUS_INTERNAL_ERROR:
+//         return "CUBLAS_STATUS_INTERNAL_ERROR";
+//     }
+//     return "unknown error";
+// }
+
+// bool check(cublasStatus_t e, int iLine, const char *szFile) {
+//     if (e != CUBLAS_STATUS_SUCCESS) {
+//         std::cout << "CUDA runtime API error " << cublasGetErrorString(e) << " at line "
+//                 << iLine << " in file " << szFile << std::endl;
+//         exit(0);
+//         return false;
+//     }
+//     return true;
+// }
+constexpr int kWarpSize       = 32;
+constexpr int kThreadPerBlock = 128;
+constexpr int kNumWarps       = kThreadPerBlock / kWarpSize;
+
+namespace numeric {
+
+// a new type should specialize get_lower_bound() & get_upper_bound()
+// rather than get_dummy()
+template <typename T>
+constexpr T get_lower_bound()
+{
+  if (std::numeric_limits<T>::has_infinity && std::numeric_limits<T>::is_signed) {
+    return -std::numeric_limits<T>::infinity();
+  } else {
+    return std::numeric_limits<T>::lowest();
+  }
+}
+
+template <typename T>
+constexpr T get_upper_bound()
+{
+  if (std::numeric_limits<T>::has_infinity) {
+    return std::numeric_limits<T>::infinity();
+  } else {
+    return std::numeric_limits<T>::max();
+  }
+}
+
+template <typename T>
+constexpr T get_dummy(bool greater)
+{
+  // TODO: for unsigned and greater=true, dummy will be 0
+  //       find better way to warn about this
+  assert(!(std::is_unsigned<T>::value && greater));
+  return greater ? get_lower_bound<T>() : get_upper_bound<T>();
+}
+
+template <bool greater, typename T>
+__device__ inline bool is_better_than(T val, T baseline)
+{
+  return (val > baseline && greater) || (val < baseline && !greater);
+}
+
+}  // namespace numeric
+
+/*******************************************************/
+/*                   Debug Function                    */
+/*******************************************************/
+
+template <typename T>
+void printDevPtr(const T* d_cache, int len, const char* name)
+{
+  T* res = (T*)malloc(sizeof(T) * len);
+  RAFT_CUDA_TRY(cudaMemcpy(res, d_cache, sizeof(T) * len, cudaMemcpyDeviceToHost));
+  printf("%s ", name);
+  for (int i = 0; i < len; i++) {
+    printf("%d(%f) ", i, (float)res[i]);
+    if (i % 10 == 9) { printf("\n"); }
+  }
+  printf("\n");
+  free(res);
+}
+
+inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
+{
+  const size_t ALIGN_BYTES = 256;
+  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
+  size_t total             = 0;
+  for (auto sz : sizes) {
+    total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
+  }
+  return total + ALIGN_BYTES - 1;
+}
+
+inline std::vector<void*> calc_aligned_pointers(const void* p, const std::vector<size_t>& sizes)
+{
+  const size_t ALIGN_BYTES = 256;
+  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
+
+  char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);
+
+  std::vector<void*> aligned_pointers;
+  aligned_pointers.reserve(sizes.size());
+  for (auto sz : sizes) {
+    aligned_pointers.push_back(ptr);
+    ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
+  }
+
+  return aligned_pointers;
+}
+
+//
+size_t _cuann_aligned(size_t size, size_t unit = 128)
+{
+  if (size % unit) { size += unit - (size % unit); }
+  return size;
+}
+
+// memset
+void _cuann_memset(void* ptr, int value, size_t count)
+{
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, ptr);
+  if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
+    RAFT_CUDA_TRY(cudaMemset(ptr, value, count));
+    // if (ret != cudaSuccess) {
+    //     fprintf(stderr, "(%s) cudaMemset() failed\n", __func__);
+    //     exit(-1);
+    // }
+  } else {
+    memset(ptr, value, count);
+  }
+}
+
+// argmin along column
+__global__ void kern_argmin(uint32_t nRows,
+                            uint32_t nCols,
+                            const float* a,  // [nRows, nCols]
+                            uint32_t* out    // [nRows]
+)
+{
+  __shared__ uint32_t smCol[1024];
+  __shared__ float smVal[1024];
+  uint32_t iRow = blockIdx.x;
+  if (iRow >= nRows) return;
+  uint32_t iCol   = threadIdx.x;
+  uint32_t minCol = nCols;
+  float minVal    = FLT_MAX;
+  for (iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
+    if (minVal > a[iCol + (nCols * iRow)]) {
+      minVal = a[iCol + (nCols * iRow)];
+      minCol = iCol;
+    }
+  }
+  smVal[threadIdx.x] = minVal;
+  smCol[threadIdx.x] = minCol;
+  __syncthreads();
+  for (uint32_t offset = blockDim.x / 2; offset > 0; offset >>= 1) {
+    if (threadIdx.x < offset) {
+      if (smVal[threadIdx.x] < smVal[threadIdx.x + offset]) {
+      } else if (smVal[threadIdx.x] > smVal[threadIdx.x + offset]) {
+        smVal[threadIdx.x] = smVal[threadIdx.x + offset];
+        smCol[threadIdx.x] = smCol[threadIdx.x + offset];
+      } else if (smCol[threadIdx.x] > smCol[threadIdx.x + offset]) {
+        smCol[threadIdx.x] = smCol[threadIdx.x + offset];
+      }
+    }
+    __syncthreads();
+  }
+  if (threadIdx.x == 0) { out[iRow] = smCol[0]; }
+}
+
+// argmin along column
+void _cuann_argmin(uint32_t nRows,
+                   uint32_t nCols,
+                   const float* a,  // [nRows, nCols]
+                   uint32_t* out    // [nRows]
+)
+{
+  uint32_t nThreads = 1024;
+  while (nThreads > nCols) {
+    nThreads /= 2;
+  }
+  nThreads = max(nThreads, 128);
+  kern_argmin<<<nRows, nThreads>>>(nRows, nCols, a, out);
+}
+
+// square sum along column
+__global__ void kern_sqsum(uint32_t nRows,
+                           uint32_t nCols,
+                           const float* a,  // [nRows, nCols]
+                           float* out       // [nRows]
+)
+{
+  uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x);
+  if (iRow >= nRows) return;
+
+  float sqsum = 0.0;
+  for (uint64_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
+    float val = a[iCol + (nCols * iRow)];
+    sqsum += val * val;
+  }
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
+  if (threadIdx.x == 0) { out[iRow] = sqsum; }
+}
+
+// square sum along column
+void _cuann_sqsum(uint32_t nRows,
+                  uint32_t nCols,
+                  const float* a,  // [numDataset, dimDataset]
+                  float* out       // [numDataset,]
+)
+{
+  dim3 threads(32, 4, 1);  // DO NOT CHANGE
+  dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1);
+  kern_sqsum<<<blocks, threads>>>(nRows, nCols, a, out);
+}
+
+// copy
+
+template <typename S, typename D>
+__global__ void kern_copy(uint32_t nRows,
+                          uint32_t nCols,
+                          const S* src,  // [nRows, ldSrc]
+                          uint32_t ldSrc,
+                          D* dst,  // [nRows, ldDst]
+                          uint32_t ldDst)
+{
+  uint32_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint32_t iCol = gid % nCols;
+  uint32_t iRow = gid / nCols;
+  if (iRow >= nRows) return;
+  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)];
+}
+
+// copy
+template <typename S, typename D>
+__global__ void kern_copy(uint32_t nRows,
+                          uint32_t nCols,
+                          const S* src,  // [nRows, ldSrc]
+                          uint32_t ldSrc,
+                          D* dst,  // [nRows, ldDst]
+                          uint32_t ldDst,
+                          D divisor)
+{
+  uint32_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint32_t iCol = gid % nCols;
+  uint32_t iRow = gid / nCols;
+  if (iRow >= nRows) return;
+  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor;
+}
+
+template <typename S, typename D>
+void _cuann_copy(uint32_t nRows,
+                 uint32_t nCols,
+                 const S* src,  // [nRows, ldSrc]
+                 uint32_t ldSrc,
+                 D* dst,  // [nRows, ldDst]
+                 uint32_t ldDst,
+                 D divisor)
+{
+  uint32_t nThreads = 128;
+  uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+  kern_copy<S, D><<<nBlocks, nThreads>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
+}
+
+template <typename S, typename D>
+void _cuann_copy(uint32_t nRows,
+                 uint32_t nCols,
+                 const S* src,  // [nRows, ldSrc]
+                 uint32_t ldSrc,
+                 D* dst,  // [nRows, ldDst]
+                 uint32_t ldDst,
+                 cudaStream_t stream,
+                 D divisor)
+{
+  uint32_t nThreads = 128;
+  uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+  kern_copy<S, D><<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
+}
+
+// accumulate
+template <typename T>
+__global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
+                                           uint32_t nCols,
+                                           float* output,    // [nRowsOutput, nCols,]
+                                           uint32_t* count,  // [nRowsOutput,]
+                                           uint32_t nRowsInput,
+                                           const T* input,         // [nRowsInput, nCols,]
+                                           const uint32_t* label,  // [nRowsInput,]
+                                           float divisor)
+{
+  uint64_t gid       = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t iCol      = gid % nCols;
+  uint64_t iRowInput = gid / nCols;
+  if (iRowInput >= nRowsInput) return;
+  uint64_t iRowOutput = label[iRowInput];
+  if (iCol == 0) { atomicAdd(&(count[iRowOutput]), 1); }
+  atomicAdd(&(output[iCol + (nCols * iRowOutput)]), input[gid] / divisor);
+}
+
+// accumulate
+template <typename T>
+void _cuann_accumulate_with_label(uint32_t nRowsOutput,
+                                  uint32_t nCols,
+                                  float* output,    // [nRowsOutput, nCols,]
+                                  uint32_t* count,  // [nRowsOutput,]
+                                  uint32_t nRowsInput,
+                                  const T* input,         // [nRowsInput, nCols,]
+                                  const uint32_t* label,  // [nRowsInput,]
+                                  float divisor = 1.0)
+{
+  bool useGPU = 1;
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, output);
+  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
+  cudaPointerGetAttributes(&attr, count);
+  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
+  cudaPointerGetAttributes(&attr, input);
+  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
+  // _cuann_memset(output, 0, sizeof(float) * nRowsOutput * nCols);
+  // _cuann_memset(count, 0, sizeof(uint32_t) * nRowsOutput);
+
+  if (useGPU) {
+    // GPU
+    uint32_t nThreads = 128;
+    uint64_t nBlocks  = (((uint64_t)nRowsInput * nCols) + nThreads - 1) / nThreads;
+    kern_accumulate_with_label<T>
+      <<<nBlocks, nThreads>>>(nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor);
+  } else {
+    // CPU
+    cudaDeviceSynchronize();
+    for (uint64_t i = 0; i < nRowsInput; i++) {
+      uint64_t l = label[i];
+      count[l] += 1;
+      for (uint64_t j = 0; j < nCols; j++) {
+        output[j + (nCols * l)] += input[j + (nCols * i)] / divisor;
+      }
+    }
+  }
+}
+
+// normalize
+__global__ void kern_normalize(uint32_t nRows,
+                               uint32_t nCols,
+                               float* a,                   // [nRows, nCols]
+                               const uint32_t* numSamples  // [nRows,]
+)
+{
+  uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x);
+  if (iRow >= nRows) return;
+  if (numSamples != NULL and numSamples[iRow] < 1) return;
+
+  float sqsum = 0.0;
+  for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
+    float val = a[iCol + (nCols * iRow)];
+    sqsum += val * val;
+  }
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
+  sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
+  sqsum = sqrt(sqsum);
+  for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
+    a[iCol + (nCols * iRow)] /= sqsum;
+  }
+}
+
+// normalize
+void _cuann_normalize(uint32_t nRows,
+                      uint32_t nCols,
+                      float* a,                   // [nRows, nCols]
+                      const uint32_t* numSamples  // [nRows,]
+)
+{
+  dim3 threads(32, 4, 1);  // DO NOT CHANGE
+  dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1);
+  kern_normalize<<<blocks, threads>>>(nRows, nCols, a, numSamples);
+}
+
+// divide
+__global__ void kern_divide(uint32_t nRows,
+                            uint32_t nCols,
+                            float* a,                   // [nRows, nCols]
+                            const uint32_t* numSamples  // [nRows,]
+)
+{
+  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t iRow = gid / nCols;
+  if (iRow >= nRows) return;
+  if (numSamples[iRow] == 0) return;
+  a[gid] /= numSamples[iRow];
+}
+
+// divide
+void _cuann_divide(uint32_t nRows,
+                   uint32_t nCols,
+                   float* a,                   // [nRows, nCols]
+                   const uint32_t* numSamples  // [nRows,]
+)
+{
+  dim3 threads(128, 1, 1);
+  dim3 blocks(((uint64_t)nRows * nCols + threads.x - 1) / threads.x, 1, 1);
+  kern_divide<<<blocks, threads>>>(nRows, nCols, a, numSamples);
+}
+
+// outer add
+__global__ void kern_outer_add(const float* a,
+                               uint32_t numA,
+                               const float* b,
+                               uint32_t numB,
+                               float* c  // [numA, numB]
+)
+{
+  uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t iA  = gid / numB;
+  uint64_t iB  = gid % numB;
+  if (iA >= numA) return;
+  float valA = (a == NULL) ? 0.0 : a[iA];
+  float valB = (b == NULL) ? 0.0 : b[iB];
+  c[gid]     = valA + valB;
+}
+
+// outer add
+void _cuann_outer_add(const float* a,
+                      uint32_t numA,
+                      const float* b,
+                      uint32_t numB,
+                      float* c  // [numA, numB]
+)
+{
+  dim3 threads(128, 1, 1);
+  dim3 blocks(((uint64_t)numA * numB + threads.x - 1) / threads.x, 1, 1);
+  kern_outer_add<<<blocks, threads>>>(a, numA, b, numB, c);
+}
+
+// copy with row list
+template <typename T>
+__global__ void kern_copy_with_list(uint32_t nRows,
+                                    uint32_t nCols,
+                                    const T* src,             // [..., ldSrc]
+                                    const uint32_t* rowList,  // [nRows,]
+                                    uint32_t ldSrc,
+                                    float* dst,  // [nRows, ldDst]
+                                    uint32_t ldDst)
+{
+  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t iCol = gid % nCols;
+  uint64_t iRow = gid / nCols;
+  if (iRow >= nRows) return;
+  uint64_t iaRow             = rowList[iRow];
+  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)];
+}
+
+// copy with row list
+template <typename T>
+__global__ void kern_copy_with_list(uint32_t nRows,
+                                    uint32_t nCols,
+                                    const T* src,             // [..., ldSrc]
+                                    const uint32_t* rowList,  // [nRows,]
+                                    uint32_t ldSrc,
+                                    float* dst,  // [nRows, ldDst]
+                                    uint32_t ldDst,
+                                    float divisor)
+{
+  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t iCol = gid % nCols;
+  uint64_t iRow = gid / nCols;
+  if (iRow >= nRows) return;
+  uint64_t iaRow             = rowList[iRow];
+  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
+}
+
+// copy with row list
+template <typename T>
+void _cuann_copy_with_list(uint32_t nRows,
+                           uint32_t nCols,
+                           const T* src,             // [..., ldSrc]
+                           const uint32_t* rowList,  // [nRows,]
+                           uint32_t ldSrc,
+                           float* dst,  // [nRows, ldDst]
+                           uint32_t ldDst,
+                           float divisor = 1.0)
+{
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, src);
+  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) {
+    for (uint64_t iRow = 0; iRow < nRows; iRow++) {
+      uint64_t iaRow = rowList[iRow];
+      for (uint64_t iCol = 0; iCol < nCols; iCol++) {
+        dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
+      }
+    }
+  } else {
+    uint32_t nThreads = 128;
+    uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+    kern_copy_with_list<T>
+      <<<nBlocks, nThreads>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor);
+  }
+}
+}  // namespace utils
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index 21e6ea026c..009d629066 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -16,14 +16,20 @@
 
 #pragma once
 
+#include <raft/cudart_utils.h>
+#include <raft/device_atomics.cuh>
+#include <raft/vectorized.cuh>
+
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/block/radix_rank_sort_operations.cuh>
 
-#include <raft/cudart_utils.h>
-#include <raft/device_atomics.cuh>
-#include <raft/vectorized.cuh>
+#include <rmm/device_vector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <optional>
 
 namespace raft::spatial::knn::detail::topk {
 
@@ -531,7 +537,8 @@ void radix_topk(const T* in,
                 T* out,
                 IdxT* out_idx,
                 bool select_min,
-                rmm::cuda_stream_view stream)
+                rmm::cuda_stream_view stream,
+                rmm::mr::device_memory_resource* mr = nullptr)
 {
   // TODO: is it possible to relax this restriction?
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
@@ -541,12 +548,22 @@ void radix_topk(const T* in,
   uint16_t max_chunk_size =
     get_optimal_batch_size<T, IdxT, BitsPerPass, BlockSize>(batch_size, blocks_per_row);
 
-  rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream);
-  rmm::device_uvector<IdxT> histograms(num_buckets * max_chunk_size, stream);
-  rmm::device_uvector<T> buf1(len * max_chunk_size, stream);
-  rmm::device_uvector<IdxT> idx_buf1(len * max_chunk_size, stream);
-  rmm::device_uvector<T> buf2(len * max_chunk_size, stream);
-  rmm::device_uvector<IdxT> idx_buf2(len * max_chunk_size, stream);
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
+  if (mr == nullptr) {
+    pool_res.emplace(
+      rmm::mr::get_current_device_resource(),
+      max_chunk_size * (sizeof(Counter<T, IdxT>)            // counters
+                        + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
+                        + sizeof(T) * 2                     // T bufs
+                        ));
+    mr = &(pool_res.value());
+  }
+  rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
+  rmm::device_uvector<IdxT> histograms(num_buckets * max_chunk_size, stream, mr);
+  rmm::device_uvector<T> buf1(len * max_chunk_size, stream, mr);
+  rmm::device_uvector<IdxT> idx_buf1(len * max_chunk_size, stream, mr);
+  rmm::device_uvector<T> buf2(len * max_chunk_size, stream, mr);
+  rmm::device_uvector<IdxT> idx_buf2(len * max_chunk_size, stream, mr);
 
   for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) {
     auto chunk_size = uint16_t(std::min<size_t>(max_chunk_size, batch_size - offset));
@@ -605,4 +622,171 @@ void radix_topk(const T* in,
   }
 }
 
+inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
+{
+  const size_t ALIGN_BYTES = 256;
+  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
+  size_t total             = 0;
+  for (auto sz : sizes) {
+    total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
+  }
+  return total + ALIGN_BYTES - 1;
+}
+
+inline std::vector<void*> calc_aligned_pointers(const void* p, const std::vector<size_t>& sizes)
+{
+  const size_t ALIGN_BYTES = 256;
+  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
+
+  char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);
+
+  std::vector<void*> aligned_pointers;
+  aligned_pointers.reserve(sizes.size());
+  for (auto sz : sizes) {
+    aligned_pointers.push_back(ptr);
+    ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
+  }
+
+  return aligned_pointers;
+}
+
+template <typename T, typename idxT, int BITS_PER_PASS, int NUM_THREAD>
+void radix_topk(void* buf,
+                size_t& buf_size,
+                const T* in,
+                const idxT* in_idx,
+                idxT batch_size,
+                idxT len,
+                idxT k,
+                T* out,
+                idxT* out_idx,
+                bool greater,
+                cudaStream_t stream)
+{
+  // TODO: is it possible to relax this condition?
+  static_assert(calc_num_passes<T, BITS_PER_PASS>() > 1);
+  constexpr int num_buckets = calc_num_buckets<BITS_PER_PASS>();
+
+  Counter<T, idxT>* counters = nullptr;
+  idxT* histograms           = nullptr;
+  T* buf1                    = nullptr;
+  idxT* idx_buf1             = nullptr;
+  T* buf2                    = nullptr;
+  idxT* idx_buf2             = nullptr;
+  {
+    std::vector<size_t> sizes = {sizeof(*counters) * batch_size,
+                                 sizeof(*histograms) * num_buckets * batch_size,
+                                 sizeof(*buf1) * len * batch_size,
+                                 sizeof(*idx_buf1) * len * batch_size,
+                                 sizeof(*buf2) * len * batch_size,
+                                 sizeof(*idx_buf2) * len * batch_size};
+    size_t total_size         = calc_aligned_size(sizes);
+    if (!buf) {
+      buf_size = total_size;
+      return;
+    }
+
+    std::vector<void*> aligned_pointers = calc_aligned_pointers(buf, sizes);
+    counters                            = static_cast<decltype(counters)>(aligned_pointers[0]);
+    histograms                          = static_cast<decltype(histograms)>(aligned_pointers[1]);
+    buf1                                = static_cast<decltype(buf1)>(aligned_pointers[2]);
+    idx_buf1                            = static_cast<decltype(idx_buf1)>(aligned_pointers[3]);
+    buf2                                = static_cast<decltype(buf2)>(aligned_pointers[4]);
+    idx_buf2                            = static_cast<decltype(idx_buf2)>(aligned_pointers[5]);
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      buf,
+      0,
+      static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]),
+      stream));
+  }
+
+  const T* in_buf        = nullptr;
+  const idxT* in_idx_buf = nullptr;
+  T* out_buf             = nullptr;
+  idxT* out_idx_buf      = nullptr;
+
+  dim3 blocks((len - 1) / (NUM_THREAD * ITEM_PER_THREAD) + 1, batch_size);
+
+  constexpr int num_passes = calc_num_passes<T, BITS_PER_PASS>();
+  for (int pass = 0; pass < num_passes; pass++) {
+    if (pass == 0) {
+      in_buf      = in;
+      in_idx_buf  = nullptr;
+      out_buf     = nullptr;
+      out_idx_buf = nullptr;
+    } else if (pass == 1) {
+      in_buf      = in;
+      in_idx_buf  = in_idx ? in_idx : nullptr;
+      out_buf     = buf1;
+      out_idx_buf = idx_buf1;
+    } else if (pass % 2 == 0) {
+      in_buf      = buf1;
+      in_idx_buf  = idx_buf1;
+      out_buf     = buf2;
+      out_idx_buf = idx_buf2;
+    } else {
+      in_buf      = buf2;
+      in_idx_buf  = idx_buf2;
+      out_buf     = buf1;
+      out_idx_buf = idx_buf1;
+    }
+
+    radix_kernel<T, idxT, BITS_PER_PASS, NUM_THREAD><<<blocks, NUM_THREAD, 0, stream>>>(in_buf,
+                                                                                        in_idx_buf,
+                                                                                        out_buf,
+                                                                                        out_idx_buf,
+                                                                                        out,
+                                                                                        out_idx,
+                                                                                        counters,
+                                                                                        histograms,
+                                                                                        len,
+                                                                                        k,
+                                                                                        greater,
+                                                                                        pass);
+  }
+}
+
+template <typename T, typename idxT>
+void radix_topk_11bits(void* buf,
+                       size_t& buf_size,
+                       const T* in,
+                       idxT batch_size,
+                       idxT len,
+                       idxT k,
+                       T* out,
+                       idxT* out_idx       = nullptr,
+                       bool greater        = true,
+                       cudaStream_t stream = 0)
+{
+  radix_topk<T, idxT, 11, 512>(buf,
+                               buf_size,
+                               in,
+                               static_cast<idxT*>(nullptr),
+                               batch_size,
+                               len,
+                               k,
+                               out,
+                               out_idx,
+                               greater,
+                               stream);
+}
+
+template <typename T, typename idxT>
+void radix_topk_11bits(void* buf,
+                       size_t& buf_size,
+                       const T* in,
+                       const idxT* in_idx,
+                       idxT batch_size,
+                       idxT len,
+                       idxT k,
+                       T* out,
+                       idxT* out_idx       = nullptr,
+                       bool greater        = true,
+                       cudaStream_t stream = 0)
+{
+  radix_topk<T, idxT, 11, 512>(
+    buf, buf_size, in, in_idx, batch_size, len, k, out, out_idx, greater, stream);
+}
+
 }  // namespace raft::spatial::knn::detail::topk
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 43c6257966..ca8611ac56 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -105,6 +105,7 @@ add_executable(test_raft
     test/sparse/row_op.cu
     test/sparse/sort.cu
     test/sparse/symmetrize.cu
+    test/spatial/ann_ivf_flat.cu
     test/spatial/knn.cu
     test/spatial/fused_l2_knn.cu
     test/spatial/haversine.cu
diff --git a/cpp/test/spatial/ann_base_kernel.cuh b/cpp/test/spatial/ann_base_kernel.cuh
new file mode 100644
index 0000000000..51ac3f0ba6
--- /dev/null
+++ b/cpp/test/spatial/ann_base_kernel.cuh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/cuda_utils.cuh>
+#include <raft/distance/distance_type.hpp>
+#include <raft/spatial/knn/detail/selection_faiss.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+template <typename DataType, typename AccT>
+__global__ void naiveDistanceKernel(float* dist,
+                                    int64_t* indices,
+                                    const DataType* x,
+                                    const DataType* y,
+                                    int m,
+                                    int n,
+                                    int k,
+                                    raft::distance::DistanceType type,
+                                    bool isRowMajor)
+{
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  AccT acc = AccT(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    if (type == raft::distance::DistanceType::InnerProduct) {
+      acc += x[xidx] * y[yidx];
+    } else {
+      AccT diff = x[xidx] - y[yidx];
+      acc += diff * diff;
+    }
+  }
+  float dist_val = (float)acc;
+  if (type == raft::distance::DistanceType::L2SqrtExpanded ||
+      type == raft::distance::DistanceType::L2SqrtUnexpanded)
+    dist_val = raft::mySqrt(dist_val);
+  int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx]    = dist_val;
+  indices[outidx] = outidx;  // This is required because of the select_k API.
+}
+
+// currently using this naive kernel as FAISS & fusedL2kNN doesn't support 8-bit
+template <typename DataType, typename AccT>
+void naiveBfKnn(float* dist_topk,
+                int64_t* indices_topk,
+                const DataType* x,
+                const DataType* y,
+                int m,
+                int n,
+                int k,
+                int numOfNN,
+                raft::distance::DistanceType type,
+                bool isRowMajor,
+                DataType metric_arg = 2.0f,
+                cudaStream_t stream = 0)
+{
+  static const dim3 TPB(16, 32, 1);
+  dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
+
+  rmm::device_uvector<float> dist(m * n, stream);
+  rmm::device_uvector<int64_t> indices(m * n, stream);
+  naiveDistanceKernel<DataType, AccT>
+    <<<nblks, TPB, 0, stream>>>(dist.data(), indices.data(), x, y, m, n, k, type, isRowMajor);
+  detail::select_k(
+    dist.data(), indices.data(), m, n, dist_topk, indices_topk, true, numOfNN, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+}
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
new file mode 100644
index 0000000000..dc66dd43e6
--- /dev/null
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+
+#include "./ann_base_kernel.cuh"
+#include <raft/distance/distance_type.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/spatial/knn/ann.cuh>
+#include <raft/spatial/knn/detail/common_faiss.h>
+
+#include <raft/spatial/knn/knn.cuh>
+
+#include <rmm/device_buffer.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <iostream>
+#include <vector>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+struct AnnIvfFlatInputs {
+  int num_queries;
+  int num_db_vecs;
+  int dim;
+  int k;
+  int nprobe;
+  int nlist;
+  raft::distance::DistanceType metric_;
+};
+
+template <typename IdxT, typename DistT, typename compareDist>
+struct idx_dist_pair {
+  IdxT idx;
+  DistT dist;
+  compareDist eq_compare;
+  bool operator==(const idx_dist_pair<IdxT, DistT, compareDist>& a) const
+  {
+    if (idx == a.idx) return true;
+    if (eq_compare(dist, a.dist)) return true;
+    return false;
+  }
+  idx_dist_pair(IdxT x, DistT y, compareDist op) : idx(x), dist(y), eq_compare(op) {}
+};
+
+template <typename T, typename DistT>
+testing::AssertionResult devArrMatchKnnPair(const T* expected_idx,
+                                            const T* actual_idx,
+                                            const DistT* expected_dist,
+                                            const DistT* actual_dist,
+                                            size_t rows,
+                                            size_t cols,
+                                            const DistT eps,
+                                            cudaStream_t stream = 0)
+{
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> exp_idx_h(new T[size]);
+  std::unique_ptr<T[]> act_idx_h(new T[size]);
+  std::unique_ptr<DistT[]> exp_dist_h(new DistT[size]);
+  std::unique_ptr<DistT[]> act_dist_h(new DistT[size]);
+  raft::update_host<T>(exp_idx_h.get(), expected_idx, size, stream);
+  raft::update_host<T>(act_idx_h.get(), actual_idx, size, stream);
+  raft::update_host<DistT>(exp_dist_h.get(), expected_dist, size, stream);
+  raft::update_host<DistT>(act_dist_h.get(), actual_dist, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  size_t match_count = 0;
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t k = 0; k < cols; ++k) {
+      size_t idx_k  = i * cols + k;  // row major assumption!
+      auto act_idx  = act_idx_h.get()[idx_k];
+      auto act_dist = act_dist_h.get()[idx_k];
+      for (size_t j = 0; j < cols; ++j) {
+        size_t idx    = i * cols + j;  // row major assumption!
+        auto exp_idx  = exp_idx_h.get()[idx];
+        auto exp_dist = exp_dist_h.get()[idx];
+        idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox<DistT>(eps));
+        idx_dist_pair act_kvp(act_idx, act_dist, raft::CompareApprox<DistT>(eps));
+        if (!(exp_kvp == act_kvp)) {
+          // return testing::AssertionFailure()
+          //        << "actual=" << act_kvp.idx << "," << act_kvp.dist << "!="
+          //        << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i << "," << j;
+          // std::cout<< "actual = " << act_kvp.idx << "," << act_kvp.dist << " != "  <<
+          //           " expected = " << exp_kvp.idx << "," << exp_kvp.dist << " @" << i
+          //           << "," << j << std::endl;
+        } else {
+          match_count++;
+          break;
+        }
+      }
+    }
+  }
+  std::cout << "Recall = " << match_count << "/" << rows * cols << std::endl;
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename DataT>
+class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
+ public:
+  AnnIVFFlatTest()
+    : stream_(handle_.get_stream()),
+      params_(::testing::TestWithParam<AnnIvfFlatInputs>::GetParam()),
+      database(params_.num_db_vecs * params_.dim, stream_),
+      search_queries(params_.num_queries * params_.dim, stream_),
+      raft_indices_(params_.num_queries * params_.k, stream_),
+      raft_distances_(params_.num_queries * params_.k, stream_),
+      faiss_indices_(params_.num_queries * params_.k, stream_),
+      faiss_distances_(params_.num_queries * params_.k, stream_)
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(DataT), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(DataT), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(int64_t), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(raft_distances_.data(), 0, raft_distances_.size() * sizeof(T), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
+  }
+
+ protected:
+  void testIVFFlat(bool is8bit)
+  {
+    if constexpr (std::is_same<DataT, uint8_t>{}) {
+      naiveBfKnn<uint8_t, uint32_t>(faiss_distances_.data(),
+                                    faiss_indices_.data(),
+                                    search_queries.data(),
+                                    database.data(),
+                                    num_queries,
+                                    num_db_vecs,
+                                    dim,
+                                    k_,
+                                    metric,
+                                    true,
+                                    2.0f,
+                                    stream_);
+    } else if constexpr (std::is_same<DataT, int8_t>{}) {
+      naiveBfKnn<int8_t, int32_t>(faiss_distances_.data(),
+                                  faiss_indices_.data(),
+                                  search_queries.data(),
+                                  database.data(),
+                                  num_queries,
+                                  num_db_vecs,
+                                  dim,
+                                  k_,
+                                  metric,
+                                  true,
+                                  2.0f,
+                                  stream_);
+    } else if constexpr (std::is_same<DataT, float>{}) {
+      naiveBfKnn<float, float>(faiss_distances_.data(),
+                               faiss_indices_.data(),
+                               search_queries.data(),
+                               database.data(),
+                               num_queries,
+                               num_db_vecs,
+                               dim,
+                               k_,
+                               metric,
+                               true,
+                               2.0f,
+                               stream_);
+    }
+
+    raft::spatial::knn::IVFFlatParam ivfParams;
+    ivfParams.nprobe = nprobe_;
+    ivfParams.nlist  = nlist_;
+    raft::spatial::knn::knnIndex index;
+    index.index   = nullptr;
+    index.gpu_res = nullptr;
+
+    approx_knn_build_index(handle_,
+                           &index,
+                           dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                           metric,
+                           0,
+                           database.data(),
+                           num_db_vecs,
+                           dim);
+    approx_knn_search(handle_,
+                      raft_distances_.data(),
+                      raft_indices_.data(),
+                      &index,
+                      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                      k_,
+                      search_queries.data(),
+                      num_queries);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    // verify.
+    devArrMatchKnnPair(faiss_indices_.data(),
+                       raft_indices_.data(),
+                       faiss_distances_.data(),
+                       raft_distances_.data(),
+                       num_queries,
+                       k_,
+                       float(0.001),
+                       stream_);
+  }
+
+  void SetUp() override
+  {
+    num_queries = params_.num_queries;
+    num_db_vecs = params_.num_db_vecs;
+    dim         = params_.dim;
+    k_          = params_.k;
+    metric      = params_.metric_;
+    nprobe_     = params_.nprobe;
+    nlist_      = params_.nlist;
+
+    unsigned long long int seed = 1234ULL;
+    raft::random::Rng r(seed);
+    if constexpr (std::is_same<DataT, float>{}) {
+      r.uniform(database.data(), num_db_vecs * dim, DataT(0.1), DataT(2.0), stream_);
+      r.uniform(search_queries.data(), num_queries * dim, DataT(0.1), DataT(2.0), stream_);
+    } else {
+      r.uniformInt(database.data(), num_db_vecs * dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(search_queries.data(), num_queries * dim, DataT(1), DataT(20), stream_);
+    }
+  }
+
+ private:
+  raft::handle_t handle_;
+  cudaStream_t stream_ = 0;
+  AnnIvfFlatInputs params_;
+  int num_queries;
+  int num_db_vecs;
+  int dim;
+  rmm::device_uvector<DataT> database;
+  rmm::device_uvector<DataT> search_queries;
+  rmm::device_uvector<int64_t> raft_indices_;
+  rmm::device_uvector<T> raft_distances_;
+  rmm::device_uvector<int64_t> faiss_indices_;
+  rmm::device_uvector<T> faiss_distances_;
+  int k_;
+  int nprobe_;
+  int nlist_;
+  raft::distance::DistanceType metric;
+};
+
+const std::vector<AnnIvfFlatInputs> inputs = {
+  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded},
+  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded},
+  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded},
+  {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded},
+
+  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct},
+  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct},
+  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct},
+  {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}};
+
+typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(false); }
+
+INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
+
+typedef AnnIVFFlatTest<float, uint8_t> AnnIVFFlatTestF_uint8;
+TEST_P(AnnIVFFlatTestF_uint8, AnnIVFFlat) { this->testIVFFlat(true); }
+
+INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_uint8, ::testing::ValuesIn(inputs));
+
+typedef AnnIVFFlatTest<float, int8_t> AnnIVFFlatTestF_int8;
+TEST_P(AnnIVFFlatTestF_int8, AnnIVFFlat) { this->testIVFFlat(true); }
+
+INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_int8, ::testing::ValuesIn(inputs));
+
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft

From 24e8c4d0b6e600da3d06c1e343ba3ddca51c514f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 16 May 2022 09:44:06 +0200
Subject: [PATCH 002/118] update save/load index function to work with cuann
 benchmark suite, separate out cuivfl handle init in separate func

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 35 ++++++++++++++---
 .../knn/detail/ann_quantized_faiss.cuh        | 39 +++++++------------
 2 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 1768bf1a1d..56ced13547 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -238,7 +238,7 @@ cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
 
   // cuBLAS
   cublasStatus_t cublasError;
-  cublasError = cublasCreate(&(cublas_handle_));
+  cublasError = cublasCreate(&cublas_handle_);
 
   if (cublasError != CUBLAS_STATUS_SUCCESS) {
     fprintf(stderr, "(%s) cublasCreate() failed\n", __func__);
@@ -413,7 +413,7 @@ cuivflStatus_t cuivflHandle::cuivflLoadIndex(const char* fileName)
                            cudaMemcpyHostToDevice));
 
 #ifdef DEBUG_L2
-  printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+  utils::printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
 #endif
   if (read_counts != total_counts) {
     fprintf(stderr, "(%s) failed to load index to file (%s)\n", __func__, fileName);
@@ -438,10 +438,15 @@ cuivflStatus_t cuivflHandle::cuivflSaveIndex(const char* fileName)
     3 + 2 * nlist_ + nlist_ * dim_ + ninterleave_ + ninterleave_ * dim_ + nlist_;
 
   written_counts += fwrite(&nrow_, sizeof(uint32_t), 1, fp);
+
   written_counts += fwrite(&dtype_, sizeof(dtype_), 1, fp);
   written_counts += fwrite(&ninterleave_, sizeof(ninterleave_), 1, fp);
   // Step 3: Write the list
-
+  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
+  cudaMemcpy(list_prefix_interleaved_host_ptr_,
+             list_prefix_interleaved_dev_ptr_,
+             sizeof(uint32_t) * nlist_,
+             cudaMemcpyDefault);
   written_counts += fwrite(list_prefix_interleaved_host_ptr_, sizeof(uint32_t), nlist_, fp);
   written_counts += fwrite(list_lengths_host_ptr_, sizeof(uint32_t), nlist_, fp);
 
@@ -455,7 +460,14 @@ cuivflStatus_t cuivflHandle::cuivflSaveIndex(const char* fileName)
 
   written_counts += fwrite(list_index_host_ptr_, sizeof(uint32_t), ninterleave_, fp);
 
+  centriod_host_ptr_ = (float*)malloc(sizeof(float) * nlist_ * dim_);
+  RAFT_CUDA_TRY(cudaMemcpy(
+    centriod_host_ptr_, centriod_dev_ptr_, sizeof(float) * nlist_ * dim_, cudaMemcpyDefault));
+
   written_counts += fwrite(centriod_host_ptr_, sizeof(float), nlist_ * dim_, fp);
+
+  RAFT_CUDA_TRY(cudaMemcpy(
+    centriod_norm_host_ptr_, centriod_norm_dev_ptr_, nlist_ * sizeof(float), cudaMemcpyDefault));
   written_counts += fwrite(centriod_norm_host_ptr_, sizeof(float), nlist_, fp);
 
   if (written_counts != total_counts) {
@@ -463,6 +475,8 @@ cuivflStatus_t cuivflHandle::cuivflSaveIndex(const char* fileName)
     return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
   }
   fclose(fp);
+  // free(list_prefix_interleaved_host_ptr_);
+  // free(centriod_host_ptr_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflHandle::cuivflSaveIndex
@@ -1040,8 +1054,8 @@ cuivflStatus_t cuivflHandle::cuivflSetSearchParameters(const uint32_t nprobe,
       metric_type_ == raft::distance::DistanceType::L2Unexpanded) {
     greater_ = false;
   } else {
-    greater_ = false;  // Need to set this to true for inner product if need FAISS like behavior for
-                       // inner product
+    // Need to set this to true for inner product if need FAISS like behavior for inner product
+    greater_ = false;
   }
 
   // Set buffer
@@ -1358,6 +1372,17 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflHandle::cuivflSearchImpl
 
+void cuivflInit(std::unique_ptr<detail::cuivflHandle>& handle,
+                raft::distance::DistanceType metric,
+                int D,
+                int nlist,
+                int niter,
+                int deviceId)
+{
+  handle = std::make_unique<cuivflHandle>(metric, D, nlist, niter, deviceId);
+  return;
+}
+
 }  // namespace detail
 }  // namespace knn
 }  // namespace spatial
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index c9d2313baa..ae6a3de4c9 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -95,11 +95,8 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
                                            IntType D,
                                            cudaStream_t stream)
 {
-  int ratio = 2;  // TODO: take these parameters from API
-  int niter = 20;
-
-  index->handle_ = std::make_unique<cuivflHandle>(metric, D, params->nlist, niter, index->device);
-
+  int ratio           = 2;  // TODO: take these parameters from API
+  int niter           = 20;
   const int dim       = D;
   const size_t ntrain = n / ratio;
   assert(ntrain > 0);
@@ -113,17 +110,9 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
   // } else {
   //     kind = cudaMemcpyHostToDevice;
   // }
-
   // rmm::device_uvector<T> trainset(ntrain * dim, stream);
   T* trainset = nullptr;
   RAFT_CUDA_TRY(cudaMallocManaged(&trainset, ntrain * dim * sizeof(T)));
-  printf("  ntrain = %d and n = %d dim = %d nlist = %d nprobe = %d\n",
-         (int)ntrain,
-         (int)n,
-         (int)dim,
-         (int)params->nlist,
-         (int)params->nprobe);
-  fflush(0);
 
   for (size_t i = 0; i < ntrain; ++i) {
     RAFT_CUDA_TRY(cudaMemcpyAsync(
@@ -138,6 +127,9 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
   } else if (typeid(T) == typeid(int8_t)) {
     dtype = CUDA_R_8I;
   }
+
+  cuivflInit(index->handle_, metric, D, params->nlist, niter, index->device);
+
   index->handle_->cuivflBuildIndex(dataset, trainset, dtype, n, ntrain, stream);
   RAFT_CUDA_TRY(cudaFree(trainset));
 }
@@ -184,6 +176,9 @@ void approx_knn_build_index(raft::handle_t& handle,
   index->index     = nullptr;
   index->metric    = metric;
   index->metricArg = metricArg;
+  int device;
+  RAFT_CUDA_TRY(cudaGetDevice(&device));
+  index->device = device;
 
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
@@ -210,22 +205,22 @@ void approx_knn_build_index(raft::handle_t& handle,
           metric == raft::distance::DistanceType::L2Unexpanded ||
           metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::InnerProduct) {
-        T* h_index_array;
+        float* h_index_array;
         cudaMallocManaged(&h_index_array, n * D * sizeof(T));
         // raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
         // handle.get_stream());
-        cudaMemcpy(h_index_array, index_array, n * D * sizeof(T), cudaMemcpyDefault);
-
+        cudaMemcpyAsync((void*)h_index_array,
+                        (void*)index_array,
+                        n * D * sizeof(T),
+                        cudaMemcpyDefault,
+                        handle.get_stream());
         approx_knn_cuivfl_ivfflat_build_index(
           index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
       } else {
-        int device;
-        RAFT_CUDA_TRY(cudaGetDevice(&device));
         raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
         gpu_res->noTempMemory();
         gpu_res->setDefaultStream(device, handle.get_stream());
         index->gpu_res = gpu_res;
-        index->device  = device;
         approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
         std::vector<float> h_index_array(n * D);
         raft::update_host(
@@ -241,7 +236,6 @@ void approx_knn_build_index(raft::handle_t& handle,
       gpu_res->noTempMemory();
       gpu_res->setDefaultStream(device, handle.get_stream());
       index->gpu_res = gpu_res;
-      index->device  = device;
       query_metric_processor->preprocess(index_array);
       if (dynamic_cast<IVFPQParam*>(params)) {
         IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
@@ -283,8 +277,6 @@ void approx_knn_search(raft::handle_t& handle,
       int nprobe                  = IVFFlat_param->nprobe;
       int max_batch               = n;
       int max_k                   = k;
-      // assert(nprobe <= nlist_); ?? is it supposed to compare agains the private member of
-      // cuivflHandle?
 
       index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
 
@@ -303,13 +295,12 @@ void approx_knn_search(raft::handle_t& handle,
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
       index->metric, n, index->handle_->getDim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
+
     if (dynamic_cast<IVFFlatParam*>(params)) {
       IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
       int nprobe                  = IVFFlat_param->nprobe;
       int max_batch               = n;
       int max_k                   = k;
-      // assert(nprobe <= nlist_); ?? is it supposed to compare agains the private member of
-      // cuivflHandle?
 
       index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
 

From cb8bcd29aef8e52d9a602c6fb6f92746ecec7b97 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 16 May 2022 15:29:45 +0200
Subject: [PATCH 003/118] Added benchmarks.

---
 cpp/bench/CMakeLists.txt       |   1 +
 cpp/bench/common/benchmark.hpp |  16 +-
 cpp/bench/spatial/knn.cu       | 357 +++++++++++++++++++++++++++++++++
 3 files changed, 369 insertions(+), 5 deletions(-)
 create mode 100644 cpp/bench/spatial/knn.cu

diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 7a0f1d5201..51e1c41499 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -30,6 +30,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
   bench/random/permute.cu
   bench/random/rng.cu
   bench/spatial/fused_l2_nn.cu
+  bench/spatial/knn.cu
   bench/spatial/selection.cu
   bench/main.cpp
 )
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index de34cf4f57..17aedec10c 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -122,6 +122,13 @@ class fixture {
   virtual void run_benchmark(::benchmark::State& state) = 0;
   virtual void generate_metrics(::benchmark::State& state) {}
 
+ protected:
+  /** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache.  */
+  void flush_L2_cache()
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
+  }
+
   /**
    * The helper to be used inside `run_benchmark`, to loop over the state and record time using the
    * cuda_event_timer.
@@ -130,9 +137,7 @@ class fixture {
   void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true)
   {
     for (auto _ : state) {
-      if (flush_L2) {
-        RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
-      }
+      if (flush_L2) { flush_L2_cache(); }
       cuda_event_timer timer(state, stream);
       benchmark_func();
     }
@@ -147,9 +152,9 @@ class Fixture : public ::benchmark::Fixture {
 
  public:
   explicit Fixture(const std::string name, const Params&... params)
-    : ::benchmark::Fixture(), params_(params...)
+    : ::benchmark::Fixture(), params_(params...), name_(name)
   {
-    SetName(name.c_str());
+    SetName(name_.c_str());
   }
   Fixture() = delete;
 
@@ -165,6 +170,7 @@ class Fixture : public ::benchmark::Fixture {
  private:
   std::unique_ptr<Class> fixture_;
   std::tuple<Params...> params_;
+  const std::string name_;
 
  protected:
   void BenchmarkCase(State& state) override
diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
new file mode 100644
index 0000000000..04b15a38a6
--- /dev/null
+++ b/cpp/bench/spatial/knn.cu
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <optional>
+
+#include <common/benchmark.hpp>
+#include <raft/spatial/knn/ann.cuh>
+#include <raft/spatial/knn/knn.cuh>
+
+#if defined RAFT_NN_COMPILED
+#include <raft/spatial/knn/specializations.hpp>
+#endif
+
+#include <raft/random/rng.cuh>
+#include <raft/sparse/detail/utils.h>
+
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <rmm/mr/host/new_delete_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+namespace raft::bench::spatial {
+
+struct params {
+  /** Size of the dataset. */
+  size_t n_samples;
+  /** Number of dimensions in the dataset. */
+  size_t n_dims;
+  /** The batch size -- number of KNN searches. */
+  size_t n_probes;
+  /** Number of nearest neighbours to find for every probe. */
+  size_t k;
+};
+
+auto operator<<(std::ostream& os, const params& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_dims << "#" << p.n_probes << "#" << p.k;
+  return os;
+}
+
+enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED };
+enum class Scope { BUILD, SEARCH, BUILD_SEARCH };
+
+auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
+{
+  switch (ts) {
+    case TransferStrategy::NO_COPY: os << "NO_COPY"; break;
+    case TransferStrategy::COPY_PLAIN: os << "COPY_PLAIN"; break;
+    case TransferStrategy::COPY_PINNED: os << "COPY_PINNED"; break;
+    case TransferStrategy::MAP_PINNED: os << "MAP_PINNED"; break;
+    case TransferStrategy::MANAGED: os << "MANAGED"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+auto operator<<(std::ostream& os, const Scope& s) -> std::ostream&
+{
+  switch (s) {
+    case Scope::BUILD: os << "BUILD"; break;
+    case Scope::SEARCH: os << "SEARCH"; break;
+    case Scope::BUILD_SEARCH: os << "BUILD_SEARCH"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+struct device_resource {
+ public:
+  explicit device_resource(bool managed) : managed_(managed)
+  {
+    if (managed_) {
+      res_ = new rmm::mr::managed_memory_resource();
+    } else {
+      res_ = rmm::mr::get_current_device_resource();
+    }
+  }
+
+  ~device_resource()
+  {
+    if (managed_) { delete res_; }
+  }
+
+  [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; }
+
+ private:
+  const bool managed_;
+  rmm::mr::device_memory_resource* res_;
+};
+
+template <typename T>
+struct host_uvector {
+  host_uvector(size_t n, bool pinned) : n_(n)
+  {
+    if (pinned) {
+      res_ = new rmm::mr::pinned_memory_resource();
+    } else {
+      res_ = new rmm::mr::new_delete_resource();
+    }
+    arr_ = static_cast<T*>(res_->allocate(n_ * sizeof(T)));
+  }
+
+  ~host_uvector() noexcept
+  {
+    res_->deallocate(arr_, n_ * sizeof(T));
+    delete res_;
+  }
+
+  auto data() -> T* { return arr_; }
+  [[nodiscard]] auto size() const -> size_t { return n_; }
+
+ private:
+  rmm::mr::host_memory_resource* res_;
+  size_t n_;
+  T* arr_;
+};
+
+template <typename ValT, typename IdxT>
+struct ivf_flat_knn {
+  raft::spatial::knn::knnIndex index;
+  raft::spatial::knn::IVFFlatParam ivf_params;
+  params ps;
+
+  ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  {
+    ivf_params.nlist  = 4096;
+    ivf_params.nprobe = 20;
+    raft::spatial::knn::approx_knn_build_index<ValT, IdxT>(
+      const_cast<raft::handle_t&>(handle),
+      &(index),
+      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivf_params),
+      raft::distance::DistanceType::L2Unexpanded,
+      2.0f,
+      const_cast<ValT*>(data),
+      (IdxT)ps.n_samples,
+      (IdxT)ps.n_dims);
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              ValT* out_dists,
+              IdxT* out_idxs)
+  {
+    raft::spatial::knn::approx_knn_search<ValT, IdxT>(
+      const_cast<raft::handle_t&>(handle),
+      out_dists,
+      out_idxs,
+      &(index),
+      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivf_params),
+      (IdxT)ps.k,
+      const_cast<ValT*>(search_items),
+      (IdxT)ps.n_probes);
+  }
+};
+
+template <typename ValT, typename IdxT>
+struct brute_force_knn {
+  ValT* index;
+  params ps;
+
+  brute_force_knn(const raft::handle_t& handle, const params& ps, const ValT* data)
+    : index(const_cast<ValT*>(data)), ps(ps)
+  {
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              ValT* out_dists,
+              IdxT* out_idxs)
+  {
+    std::vector<ValT*> input{index};
+    std::vector<size_t> sizes{ps.n_samples};
+    raft::spatial::knn::brute_force_knn<IdxT, ValT, size_t>(handle,
+                                                            input,
+                                                            sizes,
+                                                            ps.n_dims,
+                                                            const_cast<ValT*>(search_items),
+                                                            ps.n_probes,
+                                                            out_idxs,
+                                                            out_dists,
+                                                            ps.k);
+  }
+};
+
+template <typename ValT, typename IdxT, typename ImplT>
+struct knn : public fixture {
+  explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope)
+    : params_(p),
+      strategy_(strategy),
+      scope_(scope),
+      dev_mem_res_(strategy == TransferStrategy::MANAGED),
+      data_host_(0),
+      search_items_(p.n_probes * p.n_dims, stream),
+      out_dists_(p.n_probes * p.k, stream),
+      out_idxs_(p.n_probes * p.k, stream)
+  {
+    raft::random::RngState state{42};
+    raft::random::uniform(
+      state, search_items_.data(), search_items_.size(), ValT(-1.0), ValT(1.0), stream);
+    try {
+      size_t total_size = p.n_samples * p.n_dims;
+      data_host_.resize(total_size);
+      constexpr size_t kGenMinibatchSize = 1024 * 1024 * 1024;
+      rmm::device_uvector<ValT> d(std::min(kGenMinibatchSize, total_size), stream);
+      for (size_t offset = 0; offset < total_size; offset += kGenMinibatchSize) {
+        size_t actual_size = std::min(total_size - offset, kGenMinibatchSize);
+        raft::random::uniform(state, d.data(), actual_size, ValT(-1.0), ValT(1.0), stream);
+        copy(data_host_.data() + offset, d.data(), actual_size, stream);
+      }
+    } catch (std::bad_alloc& e) {
+      data_does_not_fit_ = true;
+    }
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    if (data_does_not_fit_) {
+      state.SkipWithError("The data size is too big to fit into the host memory.");
+    }
+    if (scope_ == Scope::SEARCH && strategy_ != TransferStrategy::NO_COPY) {
+      state.SkipWithError(
+        "When benchmarking without index building (Scope::SEARCH), the data must be already on the "
+        "device (TransferStrategy::NO_COPY)");
+    }
+
+    using_pool_memory_res default_resource;
+
+    try {
+      std::ostringstream label_stream;
+      label_stream << params_ << "#" << strategy_ << "#" << scope_;
+      state.SetLabel(label_stream.str());
+      raft::handle_t handle(stream);
+      std::optional<ImplT> index;
+
+      if (scope_ == Scope::SEARCH) {  // also implies TransferStrategy::NO_COPY
+        rmm::device_uvector<ValT> data(data_host_.size(), stream);
+        copy(data.data(), data_host_.data(), data_host_.size(), stream);
+        index.emplace(handle, params_, data.data());
+        stream.synchronize();
+      }
+
+      // benchmark loop
+      for (auto _ : state) {
+        // managed or plain device memory initialized anew every time
+        rmm::device_uvector<ValT> data(data_host_.size(), stream, dev_mem_res_.get());
+        ValT* data_ptr         = data.data();
+        size_t allocation_size = data_host_.size() * sizeof(ValT);
+
+        // Non-benchmarked part: using different methods to copy the data if necessary
+        switch (strategy_) {
+          case TransferStrategy::NO_COPY:  // copy data to GPU before starting the timer.
+            copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            break;
+          case TransferStrategy::COPY_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterDefault));
+            break;
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterMapped));
+            RAFT_CUDA_TRY(cudaHostGetDevicePointer(&data_ptr, data_host_.data(), 0));
+            break;
+          case TransferStrategy::MANAGED:  // sic! using std::memcpy rather than cuda copy
+            CUDA_CHECK(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, handle.get_device()));
+            CUDA_CHECK(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetAccessedBy, handle.get_device()));
+            CUDA_CHECK(cudaMemAdvise(data_ptr, allocation_size, cudaMemAdviseSetReadMostly, 0));
+            std::memcpy(data_ptr, data_host_.data(), allocation_size);
+            break;
+          default: break;
+        }
+
+        flush_L2_cache();
+        {
+          // Timer synchronizes the stream, so all prior gpu work should be done before it sets off.
+          cuda_event_timer timer(state, stream);
+          switch (strategy_) {
+            case TransferStrategy::COPY_PLAIN:
+            case TransferStrategy::COPY_PINNED:
+              copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            default: break;
+          }
+
+          if (scope_ != Scope::SEARCH) { index.emplace(handle, params_, data_ptr); }
+          if (scope_ != Scope::BUILD) {
+            index->search(handle, search_items_.data(), out_dists_.data(), out_idxs_.data());
+          }
+        }
+
+        if (scope_ != Scope::SEARCH) { index.reset(); }
+
+        switch (strategy_) {
+          case TransferStrategy::COPY_PINNED:
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(cudaHostUnregister(data_host_.data()));
+            break;
+          default: break;
+        }
+      }
+    } catch (raft::exception& e) {
+      state.SkipWithError(e.what());
+    } catch (std::bad_alloc& e) {
+      state.SkipWithError(e.what());
+    }
+  }
+
+ private:
+  const params params_;
+  const TransferStrategy strategy_;
+  const Scope scope_;
+  device_resource dev_mem_res_;
+  bool data_does_not_fit_ = false;
+
+  std::vector<ValT> data_host_;
+  rmm::device_uvector<ValT> search_items_;
+  rmm::device_uvector<ValT> out_dists_;
+  rmm::device_uvector<IdxT> out_idxs_;
+};
+
+const std::vector<params> kInputs{{2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}};
+
+const std::vector<TransferStrategy> kAllStrategies{TransferStrategy::NO_COPY,
+                                                   TransferStrategy::COPY_PLAIN,
+                                                   TransferStrategy::COPY_PINNED,
+                                                   TransferStrategy::MAP_PINNED,
+                                                   TransferStrategy::MANAGED};
+const std::vector<TransferStrategy> kNoCopyOnly{TransferStrategy::NO_COPY};
+
+const std::vector<Scope> kScopeFull{Scope::BUILD_SEARCH};
+const std::vector<Scope> kAllScopes{Scope::BUILD, Scope::SEARCH, Scope::BUILD_SEARCH};
+
+#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope)                   \
+  namespace BENCHMARK_PRIVATE_NAME(knn)                                          \
+  {                                                                              \
+    using KNN = knn<ValT, IdxT, ImplT<ValT, IdxT>>;                              \
+    RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" #ImplT, inputs, strats, scope); \
+  }
+
+KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+
+}  // namespace raft::bench::spatial

From 8c4a0a0242987e4464e9bae5fee487957f239116 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 08:17:02 +0200
Subject: [PATCH 004/118] Add a missing parameter docs

---
 cpp/include/raft/spatial/knn/ann.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 4dfb1b6d89..5768d83601 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -59,6 +59,7 @@ inline void approx_knn_build_index(raft::handle_t& handle,
  *                       their query point
  * @param[out] indices indices of the nearest neighbors
  * @param[in] index index to perform a search with
+ * @param[in] params parameters used to build the index
  * @param[in] k the number of nearest neighbors to search for
  * @param[in] query_array the query to perform a search with
  * @param[in] n number of rows in the query array

From 070fd0551d0ae9a91ec1271b6a6ddf6ffae0ac76 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 08:58:16 +0200
Subject: [PATCH 005/118] Adapt to the changes in the warpsort api

---
 .../raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh        | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 904a64c374..b4d6e6ecaf 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -802,11 +802,8 @@ __global__ void interleaved_scan(
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
-  extern __shared__ char smem_ext[];
-  constexpr auto Dir =
-    GREATER ? false : true;  // topk::block_sort uses ascending hence switch is needed.
-  topk::block_sort<topk::warp_sort_immediate, CAPACITY, Dir, float, size_t> queue(
-    k, dummy, smem_ext);
+  extern __shared__ __align__(256) uint8_t smem_ext[];
+  topk::block_sort<topk::warp_sort_immediate, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
 #endif
 
   const int laneId = threadIdx.x % kWarpSize;

From 83b66309666eb4657c6582513054d73f93ef8223 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 09:23:06 +0200
Subject: [PATCH 006/118] cleanup: use WarpSize constant

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 18 ++---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 72 +++++++++----------
 .../raft/spatial/knn/detail/ann_utils.cuh     |  3 +-
 .../knn/detail/ball_cover/registers.cuh       | 12 ++--
 .../spatial/knn/detail/haversine_distance.cuh |  4 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |  4 +-
 .../spatial/knn/detail/selection_faiss.cuh    |  6 +-
 7 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 56ced13547..81e3fc5585 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -64,13 +64,13 @@ template <typename T>
 void _ivfflat_interleaved(
   T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
 {
-  size_t group_id = index / kWarpSize;
-  size_t in_id    = (index % kWarpSize) * veclen;
-  list_data += (prefix + group_id * kWarpSize) * dim + in_id;
+  size_t group_id = index / WarpSize;
+  size_t in_id    = (index % WarpSize) * veclen;
+  list_data += (prefix + group_id * WarpSize) * dim + in_id;
 
   for (size_t i = 0; i < dim; i += veclen) {
     for (size_t j = 0; j < veclen; j++) {
-      list_data[i * kWarpSize + j] = dataset[i + j];
+      list_data[i * WarpSize + j] = dataset[i + j];
     }
   }
 }
@@ -81,13 +81,13 @@ template <typename T>
 __global__ void write_ivf_flat_interleaved_index(
   T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
 {
-  size_t group_id = index / kWarpSize;
-  size_t in_id    = (index % kWarpSize) * veclen;
-  list_data += (prefix + group_id * kWarpSize) * dim + in_id;
+  size_t group_id = index / WarpSize;
+  size_t in_id    = (index % WarpSize) * veclen;
+  list_data += (prefix + group_id * WarpSize) * dim + in_id;
 
   for (size_t i = 0; i < dim; i += veclen) {
     for (size_t j = 0; j < veclen; j++) {
-      list_data[i * kWarpSize + j] = dataset[i + j];
+      list_data[i * WarpSize + j] = dataset[i + j];
     }
   }
 }
@@ -860,7 +860,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   ninterleave_ = 0;
   for (uint32_t i = 0; i < nlist_; i++) {
     list_prefix_interleaved_host_ptr_[i] = ninterleave_;
-    ninterleave_ += ((list_lengths_host_ptr_[i] - 1) / kWarpSize + 1) * kWarpSize;
+    ninterleave_ += ((list_lengths_host_ptr_[i] - 1) / WarpSize + 1) * WarpSize;
   }
 
   if (dtype == CUDA_R_32F) {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index b4d6e6ecaf..01353b6dcb 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -197,7 +197,7 @@ struct loadAndComputeDist {
     T encV[kUnroll][veclen];
     T queryReg               = query[baseLoadIndex + laneId];
     constexpr int stride     = kUnroll * veclen;
-    constexpr int totalIter  = kWarpSize / stride;
+    constexpr int totalIter  = WarpSize / stride;
     constexpr int gmemStride = stride * wordsPerVectorBlockDim;
 #pragma unroll
     for (int i = 0; i < totalIter; ++i, data += gmemStride) {
@@ -208,7 +208,7 @@ struct loadAndComputeDist {
         const int d = (i * kUnroll + j) * veclen;
 #pragma unroll
         for (int k = 0; k < veclen; ++k) {
-          q[k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          q[k] = SHFL_SYNC(queryReg, d + k, WarpSize);
           computeDist(dist, q[k], encV[j][k]);  //@TODO add other metrics
         }
       }
@@ -227,7 +227,7 @@ struct loadAndComputeDist {
       ldg(enc, data + loadDataIdx);
 #pragma unroll
       for (int k = 0; k < veclen; k++) {
-        q[k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+        q[k] = SHFL_SYNC(queryReg, d + k, WarpSize);
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -287,7 +287,7 @@ struct loadAndComputeDist<kUnroll,
     constexpr int stride = kUnroll * uint8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         ldg(encV[j],
@@ -296,7 +296,7 @@ struct loadAndComputeDist<kUnroll,
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          q[j][k] = SHFL_SYNC(queryReg, d + k, WarpSize);
           computeDist(dist, q[j][k], encV[j][k]);
         }
       }
@@ -319,7 +319,7 @@ struct loadAndComputeDist<kUnroll,
       ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, kWarpSize);
+        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, WarpSize);
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -368,12 +368,12 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uin
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = reinterpret_cast<unsigned const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -390,7 +390,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uin
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -438,13 +438,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uin
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = 0;
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -462,7 +462,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uin
     queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -508,13 +508,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uin
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = 0;
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -533,7 +533,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uin
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = 0;
       enc          = data[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d, kWarpSize);
+      uint32_t q   = SHFL_SYNC(queryReg, d, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -592,7 +592,7 @@ struct loadAndComputeDist<kUnroll,
     constexpr int stride = kUnroll * int8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         ldg(encV[j],
@@ -601,7 +601,7 @@ struct loadAndComputeDist<kUnroll,
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = SHFL_SYNC(queryReg, d + k, kWarpSize);
+          q[j][k] = SHFL_SYNC(queryReg, d + k, WarpSize);
           computeDist(dist, q[j][k], encV[j][k]);
         }
       }
@@ -621,7 +621,7 @@ struct loadAndComputeDist<kUnroll,
       ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, kWarpSize);  // Here 4 is for 1 - int;
+        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -668,13 +668,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = 0;
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -689,7 +689,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int
     queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
-      int32_t q   = SHFL_SYNC(queryReg, d / veclen, kWarpSize);
+      int32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -737,13 +737,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
     int32_t q[kUnroll];
 
 #pragma unroll
-    for (int i = 0; i < kWarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = 0;
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, kWarpSize);
+        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -758,7 +758,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       int32_t enc = 0;
       enc         = data[laneId];
-      int32_t q   = SHFL_SYNC(queryReg, d, kWarpSize);
+      int32_t q   = SHFL_SYNC(queryReg, d, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -806,17 +806,17 @@ __global__ void interleaved_scan(
   topk::block_sort<topk::warp_sort_immediate, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
 #endif
 
-  const int laneId = threadIdx.x % kWarpSize;
-  const int warpId = threadIdx.x / kWarpSize;
+  const int laneId = threadIdx.x % WarpSize;
+  const int warpId = threadIdx.x / WarpSize;
   int queryId      = blockIdx.y;
 
   /// Set the address
   auto query                           = queries + queryId * dim;
-  constexpr int bytesPerVectorBlockDim = sizeof(T) * kWarpSize;
+  constexpr int bytesPerVectorBlockDim = sizeof(T) * WarpSize;
   constexpr int wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(T);
 
   // int wordsPerVectorBlock = wordsPerVectorBlockDim * dim;
-  const int dimBlocks = roundDown(dim, kWarpSize);
+  const int dimBlocks = roundDown(dim, WarpSize);
 
   // This should be multiple of warpSize = 32
   constexpr uint32_t queryShmemSize = 2048;
@@ -846,12 +846,12 @@ __global__ void interleaved_scan(
     const uint32_t numVecs = list_lengths[listId];
 
     // The number of interleaved group to be processed
-    const uint32_t numBlocks = divUp(numVecs, kWarpSize);
+    const uint32_t numBlocks = divUp(numVecs, WarpSize);
 
     for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
       value_t dist = 0;
       // This is the vector a given lane/thread handles
-      const uint32_t vec = block * kWarpSize + laneId;
+      const uint32_t vec = block * WarpSize + laneId;
       bool valid         = vec < numVecs;
       size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)laneId;
       // This is where this warp begins reading data
@@ -860,10 +860,10 @@ __global__ void interleaved_scan(
 
       if (valid) {
         /// load query from shared mem
-        for (int dBase = 0; dBase < shLoadDim; dBase += kWarpSize) {  //
-          constexpr int kUnroll   = kWarpSize / veclen;
+        for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {  //
+          constexpr int kUnroll   = WarpSize / veclen;
           constexpr int stride    = kUnroll * veclen;
-          constexpr int totalIter = kWarpSize / stride;
+          constexpr int totalIter = WarpSize / stride;
 
           loadAndComputeDist<kUnroll,
                              wordsPerVectorBlockDim,
@@ -875,12 +875,12 @@ __global__ void interleaved_scan(
 #pragma unroll
           for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
             obj.runLoadShmemCompute(data, queryShared, laneId, dBase, i);
-          }  // end for i < kWarpSize / kUnroll
+          }  // end for i < WarpSize / kUnroll
         }    // end for dBase < dimBlocks
       }
 
       if (dim > queryShmemSize) {
-        constexpr int kUnroll = kWarpSize / veclen;
+        constexpr int kUnroll = WarpSize / veclen;
         ;
         loadAndComputeDist<kUnroll,
                            wordsPerVectorBlockDim,
@@ -889,7 +889,7 @@ __global__ void interleaved_scan(
                            T,
                            value_t>
           obj(dist, computeDist);
-        for (int dBase = shLoadDim; dBase < dimBlocks; dBase += kWarpSize) {  //
+        for (int dBase = shLoadDim; dBase < dimBlocks; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
         }
         // Remainder chunk = dim - dimBlocks
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index aa75f8d002..c7aa381acf 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -94,9 +94,8 @@ namespace utils {
 //     }
 //     return true;
 // }
-constexpr int kWarpSize       = 32;
 constexpr int kThreadPerBlock = 128;
-constexpr int kNumWarps       = kThreadPerBlock / kWarpSize;
+constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
 
 namespace numeric {
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index 07608f1688..963c4730f4 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -171,7 +171,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
                                               dist_func dfunc,
                                               value_int* dist_counter)
 {
-  static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  static constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -196,7 +196,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
          shared_memV,
          k);
 
-  const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize);
+  const value_int n_k = faiss::gpu::utils::roundDown(k, WarpSize);
   value_int i         = threadIdx.x;
   for (; i < n_k; i += tpb) {
     value_idx ind = knn_inds[blockIdx.x * k + i];
@@ -223,7 +223,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
       // Round R_size to the nearest warp threads so they can
       // all be computing in parallel.
 
-      const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+      const value_int limit = faiss::gpu::utils::roundDown(R_size, WarpSize);
 
       i = threadIdx.x;
       for (; i < limit; i += tpb) {
@@ -333,7 +333,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
                                            distance_func dfunc,
                                            float weight = 1.0)
 {
-  static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  static constexpr value_int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -389,7 +389,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
 
     value_idx R_size = R_stop_offset - R_start_offset;
 
-    value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+    value_int limit = faiss::gpu::utils::roundDown(R_size, WarpSize);
     value_int i     = threadIdx.x;
     for (; i < limit; i += tpb) {
       // Index and distance of current candidate's nearest landmark
@@ -789,4 +789,4 @@ void rbc_low_dim_pass_two(const raft::handle_t& handle,
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index c2d89aae7d..4eb41b7931 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -65,7 +65,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
                                      size_t n_index_rows,
                                      int k)
 {
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
@@ -75,7 +75,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
       heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
-  int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
+  int limit = faiss::gpu::utils::roundDown(n_index_rows, WarpSize);
 
   const value_t* query_ptr = query + (blockIdx.x * 2);
   value_t x1               = query_ptr[0];
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 196124352a..9112657d44 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -63,7 +63,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
                                        int k,
                                        value_idx* translations)
 {
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
@@ -90,7 +90,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   value_t* inKStart   = inK + (row_idx + col);
   value_idx* inVStart = inV + (row_idx + col);
 
-  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
+  int limit             = faiss::gpu::utils::roundDown(total_k, WarpSize);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
index d157a57f52..91f7edd16f 100644
--- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -48,7 +48,7 @@ __global__ void select_k_kernel(key_t* inK,
                                 payload_t initV,
                                 int k)
 {
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ key_t smemK[kNumWarps * warp_q];
   __shared__ payload_t smemV[kNumWarps * warp_q];
@@ -66,7 +66,7 @@ __global__ void select_k_kernel(key_t* inK,
   payload_t* inVStart = inV + idx + i;
 
   // Whole warps must participate in the selection
-  int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
+  int limit = faiss::gpu::utils::roundDown(n_cols, WarpSize);
 
   for (; i < limit; i += tpb) {
     inKStart = inK + idx + i;
@@ -172,4 +172,4 @@ inline void select_k(key_t* inK,
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft

From 3a2703cc18002522e93f6fdf46bd8716b68731f4 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 09:59:46 +0200
Subject: [PATCH 007/118] cleanup: remove unnecessary helpers

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 21 ++---
 .../raft/spatial/knn/detail/ann_utils.cuh     | 82 -------------------
 2 files changed, 8 insertions(+), 95 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 01353b6dcb..095fffbe43 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -782,8 +782,8 @@ __global__ void interleaved_scan(
   const uint32_t k,
   const uint32_t dim,
   size_t* neighbors,  // [batch_size, nprobe]
-  float* distances,   // [batch_size, nprobe]
-  const float dummy)
+  float* distances    // [batch_size, nprobe]
+)
 {
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
@@ -908,7 +908,8 @@ __global__ void interleaved_scan(
       }
 
       /// Inqueue warp_wise
-      float val = (valid) ? (float)dist : dummy;
+      constexpr float kDummy = GREATER ? lower_bound<float>() : upper_bound<float>();
+      float val              = (valid) ? (float)dist : kDummy;
       queue.add(val, idx);
     }  // end for block < numBlocks
   }
@@ -978,8 +979,6 @@ void launch_interleaved_scan_kernel(
   cudaStream_t stream,
   uint32_t& gridDimX)
 {
-  const float dummy = utils::numeric::get_dummy<float>(greater);  // should be value_t?
-
   // Accumulation inner product lambda
   auto inner_prod_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
     if constexpr ((std::is_same<T, int8_t>{}) || (std::is_same<T, uint8_t>{})) {
@@ -1046,8 +1045,7 @@ void launch_interleaved_scan_kernel(
         k,
         dim,
         neighbors,
-        distances,
-        dummy);
+        distances);
     } else {
       constexpr auto interleaved_scan_inner_prod_greater =
         interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), true>;
@@ -1072,8 +1070,7 @@ void launch_interleaved_scan_kernel(
         k,
         dim,
         neighbors,
-        distances,
-        dummy);
+        distances);
     }
   } else {
     if (metric == raft::distance::DistanceType::L2Expanded ||
@@ -1101,8 +1098,7 @@ void launch_interleaved_scan_kernel(
         k,
         dim,
         neighbors,
-        distances,
-        dummy);
+        distances);
     } else {
       constexpr auto interleaved_scan_inner_prod_ngreater =
         interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), false>;
@@ -1127,8 +1123,7 @@ void launch_interleaved_scan_kernel(
         k,
         dim,
         neighbors,
-        distances,
-        dummy);
+        distances);
     }
   }
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index c7aa381acf..ccfe73ffd0 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -53,91 +53,9 @@ namespace knn {
 namespace detail {
 namespace utils {
 
-// bool check(cudaError_t e, int iLine, const char *szFile) {
-//         if (e != cudaSuccess) {
-//             std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine
-//                     << " in file " << szFile << std::endl;
-//             exit(0);
-//             return false;
-//         }
-//         return true;
-// }
-
-// const char *cublasGetErrorString(cublasStatus_t status) {
-//     switch (status) {
-//     case CUBLAS_STATUS_SUCCESS:
-//         return "CUBLAS_STATUS_SUCCESS";
-//     case CUBLAS_STATUS_NOT_INITIALIZED:
-//         return "CUBLAS_STATUS_NOT_INITIALIZED";
-//     case CUBLAS_STATUS_ALLOC_FAILED:
-//         return "CUBLAS_STATUS_ALLOC_FAILED";
-//     case CUBLAS_STATUS_INVALID_VALUE:
-//         return "CUBLAS_STATUS_INVALID_VALUE";
-//     case CUBLAS_STATUS_ARCH_MISMATCH:
-//         return "CUBLAS_STATUS_ARCH_MISMATCH";
-//     case CUBLAS_STATUS_MAPPING_ERROR:
-//         return "CUBLAS_STATUS_MAPPING_ERROR";
-//     case CUBLAS_STATUS_EXECUTION_FAILED:
-//         return "CUBLAS_STATUS_EXECUTION_FAILED";
-//     case CUBLAS_STATUS_INTERNAL_ERROR:
-//         return "CUBLAS_STATUS_INTERNAL_ERROR";
-//     }
-//     return "unknown error";
-// }
-
-// bool check(cublasStatus_t e, int iLine, const char *szFile) {
-//     if (e != CUBLAS_STATUS_SUCCESS) {
-//         std::cout << "CUDA runtime API error " << cublasGetErrorString(e) << " at line "
-//                 << iLine << " in file " << szFile << std::endl;
-//         exit(0);
-//         return false;
-//     }
-//     return true;
-// }
 constexpr int kThreadPerBlock = 128;
 constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
 
-namespace numeric {
-
-// a new type should specialize get_lower_bound() & get_upper_bound()
-// rather than get_dummy()
-template <typename T>
-constexpr T get_lower_bound()
-{
-  if (std::numeric_limits<T>::has_infinity && std::numeric_limits<T>::is_signed) {
-    return -std::numeric_limits<T>::infinity();
-  } else {
-    return std::numeric_limits<T>::lowest();
-  }
-}
-
-template <typename T>
-constexpr T get_upper_bound()
-{
-  if (std::numeric_limits<T>::has_infinity) {
-    return std::numeric_limits<T>::infinity();
-  } else {
-    return std::numeric_limits<T>::max();
-  }
-}
-
-template <typename T>
-constexpr T get_dummy(bool greater)
-{
-  // TODO: for unsigned and greater=true, dummy will be 0
-  //       find better way to warn about this
-  assert(!(std::is_unsigned<T>::value && greater));
-  return greater ? get_lower_bound<T>() : get_upper_bound<T>();
-}
-
-template <bool greater, typename T>
-__device__ inline bool is_better_than(T val, T baseline)
-{
-  return (val > baseline && greater) || (val < baseline && !greater);
-}
-
-}  // namespace numeric
-
 /*******************************************************/
 /*                   Debug Function                    */
 /*******************************************************/

From 31bbaec28ef2ad6fa191139c7d96dc9791dd1cd6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 10:26:04 +0200
Subject: [PATCH 008/118] Use a more efficient warp_sort_filtered

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 095fffbe43..8cea7f4e5e 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -803,7 +803,7 @@ __global__ void interleaved_scan(
 
 #else
   extern __shared__ __align__(256) uint8_t smem_ext[];
-  topk::block_sort<topk::warp_sort_immediate, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
+  topk::block_sort<topk::warp_sort_filtered, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
 #endif
 
   const int laneId = threadIdx.x % WarpSize;
@@ -907,7 +907,7 @@ __global__ void interleaved_scan(
         }
       }
 
-      /// Inqueue warp_wise
+      // Enqueue one element per thread
       constexpr float kDummy = GREATER ? lower_bound<float>() : upper_bound<float>();
       float val              = (valid) ? (float)dist : kDummy;
       queue.add(val, idx);

From 4b40181fe202d5447176b75afb8156960ecbb663 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 10:33:38 +0200
Subject: [PATCH 009/118] Recover files that have only non-relevant changes to
 reduce the size of the PR

---
 .../raft/spatial/knn/detail/ball_cover/registers.cuh | 12 ++++++------
 .../raft/spatial/knn/detail/haversine_distance.cuh   |  4 ++--
 .../spatial/knn/detail/knn_brute_force_faiss.cuh     |  4 ++--
 .../raft/spatial/knn/detail/selection_faiss.cuh      |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index 963c4730f4..07608f1688 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -171,7 +171,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
                                               dist_func dfunc,
                                               value_int* dist_counter)
 {
-  static constexpr int kNumWarps = tpb / WarpSize;
+  static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -196,7 +196,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
          shared_memV,
          k);
 
-  const value_int n_k = faiss::gpu::utils::roundDown(k, WarpSize);
+  const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize);
   value_int i         = threadIdx.x;
   for (; i < n_k; i += tpb) {
     value_idx ind = knn_inds[blockIdx.x * k + i];
@@ -223,7 +223,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
       // Round R_size to the nearest warp threads so they can
       // all be computing in parallel.
 
-      const value_int limit = faiss::gpu::utils::roundDown(R_size, WarpSize);
+      const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
 
       i = threadIdx.x;
       for (; i < limit; i += tpb) {
@@ -333,7 +333,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
                                            distance_func dfunc,
                                            float weight = 1.0)
 {
-  static constexpr value_int kNumWarps = tpb / WarpSize;
+  static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
   __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
@@ -389,7 +389,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
 
     value_idx R_size = R_stop_offset - R_start_offset;
 
-    value_int limit = faiss::gpu::utils::roundDown(R_size, WarpSize);
+    value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
     value_int i     = threadIdx.x;
     for (; i < limit; i += tpb) {
       // Index and distance of current candidate's nearest landmark
@@ -789,4 +789,4 @@ void rbc_low_dim_pass_two(const raft::handle_t& handle,
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
-};  // namespace raft
+};  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 4eb41b7931..c2d89aae7d 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -65,7 +65,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
                                      size_t n_index_rows,
                                      int k)
 {
-  constexpr int kNumWarps = tpb / WarpSize;
+  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
@@ -75,7 +75,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
       heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
-  int limit = faiss::gpu::utils::roundDown(n_index_rows, WarpSize);
+  int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
 
   const value_t* query_ptr = query + (blockIdx.x * 2);
   value_t x1               = query_ptr[0];
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 9112657d44..196124352a 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -63,7 +63,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
                                        int k,
                                        value_idx* translations)
 {
-  constexpr int kNumWarps = tpb / WarpSize;
+  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
@@ -90,7 +90,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   value_t* inKStart   = inK + (row_idx + col);
   value_idx* inVStart = inV + (row_idx + col);
 
-  int limit             = faiss::gpu::utils::roundDown(total_k, WarpSize);
+  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
index 91f7edd16f..d157a57f52 100644
--- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -48,7 +48,7 @@ __global__ void select_k_kernel(key_t* inK,
                                 payload_t initV,
                                 int k)
 {
-  constexpr int kNumWarps = tpb / WarpSize;
+  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ key_t smemK[kNumWarps * warp_q];
   __shared__ payload_t smemV[kNumWarps * warp_q];
@@ -66,7 +66,7 @@ __global__ void select_k_kernel(key_t* inK,
   payload_t* inVStart = inV + idx + i;
 
   // Whole warps must participate in the selection
-  int limit = faiss::gpu::utils::roundDown(n_cols, WarpSize);
+  int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
 
   for (; i < limit; i += tpb) {
     inKStart = inK + idx + i;
@@ -172,4 +172,4 @@ inline void select_k(key_t* inK,
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
-};  // namespace raft
+};  // namespace raft
\ No newline at end of file

From 7e3041c9c8dc587b8d3b8d0c8535a9b56af0a66e Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 17 May 2022 15:36:47 +0200
Subject: [PATCH 010/118] wip: replacing explicit allocations with rmm buffers

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 187 ++++++++----------
 .../knn/detail/ann_kmeans_balanced.cuh        |  21 +-
 2 files changed, 92 insertions(+), 116 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 81e3fc5585..388321d4b1 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -37,6 +37,11 @@
 #include <raft/distance/distance.hpp>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
@@ -122,7 +127,7 @@ class cuivflHandle {
                                   cudaDataType_t dtype,
                                   uint32_t nrow,
                                   uint32_t nTrainset,
-                                  cudaStream_t stream);
+                                  rmm::cuda_stream_view stream);
   cuivflStatus_t cuivflSaveIndex(const char* fileName);
 
   cuivflStatus_t cuivflLoadIndex(const char* fileName);
@@ -136,7 +141,7 @@ class cuivflHandle {
                               uint32_t k,
                               size_t* neighbors,
                               float* distances,
-                              cudaStream_t stream,
+                              rmm::cuda_stream_view stream,
                               cudaDataType_t dtype);
 
   cuivflStatus_t queryIVFFlatGridSize(const uint32_t nprobe,
@@ -159,14 +164,15 @@ class cuivflHandle {
   size_t ninterleave_;    // The number of elements in 32 interleaved group for input dataset
   size_t buf_topk_size_;  // The size of buffer used for topk select.
   size_t floatQuerySize;  // The size of float converted queries from int8_t/uint8_t
-  cudaStream_t stream_;   // The stream for build and search
-  uint32_t veclen;        // The vectorization length of dataset in index.
-  uint32_t gridDimX_;     // The number of blocks launched across nprobe.
+  rmm::cuda_stream_view stream_;  // The stream for build and search
+  uint32_t veclen;                // The vectorization length of dataset in index.
+  uint32_t gridDimX_;             // The number of blocks launched across nprobe.
 
  private:
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
   void* list_data_dev_ptr_;
+  
   // The device memory pointer; inverted list for index; size [ninterleave_]
   uint32_t* list_index_dev_ptr_;
   // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
@@ -202,14 +208,14 @@ class cuivflHandle {
                                             cudaDataType_t dtype,
                                             uint32_t nrow,
                                             uint32_t ntrain,
-                                            cudaStream_t stream);
+                                            rmm::cuda_stream_view stream);
   template <typename T, typename value_t>
   cuivflStatus_t cuivflSearchImpl(const T* queries,
                                   uint32_t batch_size,
                                   uint32_t k,
                                   size_t* neighbors,
                                   value_t* distances,
-                                  cudaStream_t stream);
+                                  rmm::cuda_stream_view stream);
 };
 
 // cuivflCreate
@@ -228,7 +234,7 @@ cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
   floatQuerySize = 0;
   veclen         = 1;
   gridDimX_      = 0;
-  stream_        = 0;
+  stream_        = rmm::cuda_stream_default;
 
   if ((dim % 4) == 0) {
     veclen = 4;
@@ -490,42 +496,36 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
                                                         cudaDataType_t dtype,
                                                         uint32_t nrow,
                                                         uint32_t ntrain,
-                                                        cudaStream_t stream)
+                                                        rmm::cuda_stream_view stream)
 {
   uint32_t numTrainset   = ntrain;
   uint32_t numClusters   = nlist_;
   uint32_t dimDataset    = dim_;
   uint32_t numIterations = niter_;
 
-  uint32_t* trainsetLabels = nullptr;
-  RAFT_CUDA_TRY(cudaMallocManaged(&trainsetLabels, sizeof(uint32_t) * numTrainset));
+  rmm::device_uvector<uint32_t> trainsetLabels(numTrainset, stream);
 
   float* clusterCenters = centriod_manage_ptr;
 
   uint32_t numMesoClusters = pow((double)(numClusters), (double)1.0 / 2.0) + 0.5;
   fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters);
 
-  float* mesoClusterCenters;  // [numMesoClusters, dimDataset]
-  RAFT_CUDA_TRY(
-    cudaMallocManaged(&mesoClusterCenters, sizeof(float) * numMesoClusters * dimDataset));
-
-  uint32_t* mesoClusterLabels;  // [numTrainset,]
-  RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterLabels, sizeof(uint32_t) * numTrainset));
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::device_uvector<float> mesoClusterCenters(
+    numMesoClusters * dimDataset, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mesoClusterLabels(numTrainset, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mesoClusterSize_buf(numMesoClusters, stream, &managed_memory);
+  rmm::device_uvector<float> mesoClusterCentersTemp(
+    numMesoClusters * dimDataset, stream, &managed_memory);
 
-  uint32_t* mesoClusterSize;  // [numMesoClusters,]
-  RAFT_CUDA_TRY(cudaMallocManaged(&mesoClusterSize, sizeof(uint32_t) * numMesoClusters));
-
-  float* mesoClusterCentersTemp;  // [numMesoClusters, dimDataset]
-  RAFT_CUDA_TRY(
-    cudaMallocManaged(&mesoClusterCentersTemp, sizeof(float) * numMesoClusters * dimDataset));
+  auto mesoClusterSize = mesoClusterSize_buf.data();
 
   size_t sizePredictWorkspace =
     _cuann_kmeans_predict_bufferSize(numMesoClusters,  // number of centers
                                      dimDataset,
                                      numTrainset  // number of vectors
     );
-  void* predictWorkspace = NULL;
-  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+  rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream);
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
     fprintf(stderr,
@@ -534,27 +534,27 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
             (float)iter / 2,
             numIterations);
     _cuann_kmeans_predict(cublas_handle_,
-                          mesoClusterCenters,
+                          mesoClusterCenters.data(),
                           numMesoClusters,
                           dimDataset,
                           trainset,
                           dtype,
                           numTrainset,
-                          mesoClusterLabels,
+                          mesoClusterLabels.data(),
                           metric_type_,
                           (iter != 0),
-                          predictWorkspace,
-                          mesoClusterCentersTemp,
+                          predictWorkspace.data(),
+                          mesoClusterCentersTemp.data(),
                           mesoClusterSize);
 
     if (iter < 2 * (numIterations - 2)) {
-      if (_cuann_kmeans_adjust_centers(mesoClusterCenters,
+      if (_cuann_kmeans_adjust_centers(mesoClusterCenters.data(),
                                        numMesoClusters,
                                        dimDataset,
                                        trainset,
                                        dtype,
                                        numTrainset,
-                                       mesoClusterLabels,
+                                       mesoClusterLabels.data(),
                                        metric_type_,
                                        mesoClusterSize,
                                        (float)1.0 / 4)) {
@@ -566,10 +566,9 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
   fprintf(stderr, "\n");
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
-  uint32_t* numFineClusters;  // [numMesoClusters,]
-  numFineClusters            = (uint32_t*)malloc(sizeof(uint32_t) * numMesoClusters);
-  uint32_t* csumFineClusters = (uint32_t*)malloc(sizeof(uint32_t) * (numMesoClusters + 1));
-  csumFineClusters[0]        = 0;
+  std::vector<uint32_t> numFineClusters(numMesoClusters);
+  std::vector<uint32_t> csumFineClusters(numMesoClusters + 1);
+  csumFineClusters[0] = 0;
 
   uint32_t numClustersRemain  = numClusters;
   uint32_t numTrainsetRemain  = numTrainset;
@@ -598,10 +597,11 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
   assert(csumFineClusters[numMesoClusters] == numClusters);
 
   // uint32_t *idsTrainset = (uint32_t *)malloc(sizeof(uint32_t) * mesoClusterSizeMax);
-  uint32_t* idsTrainset;
-  RAFT_CUDA_TRY(cudaMallocManaged(&idsTrainset, sizeof(uint32_t) * mesoClusterSizeMax));
-  float* subTrainset;
-  RAFT_CUDA_TRY(cudaMallocManaged(&subTrainset, sizeof(float) * mesoClusterSizeMax * dimDataset));
+  rmm::device_uvector<uint32_t> idsTrainset_buf(mesoClusterSizeMax, stream, &managed_memory);
+  rmm::device_uvector<float> subTrainset_buf(
+    mesoClusterSizeMax * dimDataset, stream, &managed_memory);
+  auto idsTrainset = idsTrainset_buf.data();
+  auto subTrainset = subTrainset_buf.data();
 
   sizePredictWorkspace = 0;
   for (uint32_t i = 0; i < numMesoClusters; i++) {
@@ -614,30 +614,23 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
   }
 
   // label (cluster ID) of each vector
-  uint32_t* labelsMP = NULL;
-  RAFT_CUDA_TRY(cudaMallocManaged(&labelsMP, mesoClusterSizeMax * sizeof(uint32_t)));
-
-  cudaFree(predictWorkspace);
-  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+  rmm::device_uvector<uint32_t> labelsMP(mesoClusterSizeMax, stream, &managed_memory);
 
-  float* clusterCentersEach = NULL;
-  RAFT_CUDA_TRY(
-    cudaMallocManaged(&clusterCentersEach, numFineClustersMax * dimDataset * sizeof(float)));
-
-  float* clusterCentersMP = NULL;
-  RAFT_CUDA_TRY(
-    cudaMallocManaged(&clusterCentersMP, numFineClustersMax * dimDataset * sizeof(float)));
+  predictWorkspace.resize(sizePredictWorkspace, stream);
 
+  rmm::device_uvector<float> clusterCentersEach(
+    numFineClustersMax * dimDataset, stream, &managed_memory);
+  rmm::device_uvector<float> clusterCentersMP(
+    numFineClustersMax * dimDataset, stream, &managed_memory);
   // number of vectors in each cluster
-  uint32_t* clusterSizeMP = NULL;
-  RAFT_CUDA_TRY(cudaMallocManaged(&clusterSizeMP, numFineClustersMax * sizeof(uint32_t)));
+  rmm::device_uvector<uint32_t> clusterSizeMP(numFineClustersMax, stream, &managed_memory);
 
   // Training clusters in each meso-clusters
   uint32_t numClustersDone = 0;
   for (uint32_t i = 0; i < numMesoClusters; i++) {
     uint32_t k = 0;
     for (uint32_t j = 0; j < numTrainset; j++) {
-      if (mesoClusterLabels[j] != i) continue;
+      if (mesoClusterLabels.data()[j] != i) continue;
       idsTrainset[k++] = j;
     }
     assert(k == mesoClusterSize[i]);
@@ -685,29 +678,29 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
               numIterations);
 
       _cuann_kmeans_predict(cublas_handle_,
-                            clusterCentersEach,
+                            clusterCentersEach.data(),
                             numFineClusters[i],
                             dimDataset,
                             subTrainset,
                             CUDA_R_32F,
                             mesoClusterSize[i],
-                            labelsMP,
+                            labelsMP.data(),
                             metric_type_,
                             (iter != 0),
-                            predictWorkspace,
-                            clusterCentersMP,
-                            clusterSizeMP);
+                            predictWorkspace.data(),
+                            clusterCentersMP.data(),
+                            clusterSizeMP.data());
 
       if (iter < 2 * (numIterations - 2)) {
-        if (_cuann_kmeans_adjust_centers(clusterCentersEach,
+        if (_cuann_kmeans_adjust_centers(clusterCentersEach.data(),
                                          numFineClusters[i],
                                          dimDataset,
                                          subTrainset,
                                          CUDA_R_32F,
                                          mesoClusterSize[i],
-                                         labelsMP,
+                                         labelsMP.data(),
                                          metric_type_,
-                                         clusterSizeMP,
+                                         clusterSizeMP.data(),
                                          (float)1.0 / 4)) {
           iter -= 1;
         }
@@ -715,7 +708,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
     }
     RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (dimDataset * csumFineClusters[i]),
-                             clusterCentersEach,
+                             clusterCentersEach.data(),
                              sizeof(float) * numFineClusters[i] * dimDataset,
                              cudaMemcpyDefault));
     numClustersDone += numFineClusters[i];
@@ -723,28 +716,12 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
   fprintf(stderr, "\n");
   assert(numClustersDone == numClusters);
 
-  cudaFree(subTrainset);
-  cudaFree(idsTrainset);
-  free(numFineClusters);
-  free(csumFineClusters);
-  cudaFree(mesoClusterSize);
-  cudaFree(mesoClusterLabels);
-  cudaFree(mesoClusterCenters);
-  cudaFree(predictWorkspace);
-  cudaFree(clusterSizeMP);
-  RAFT_CUDA_TRY(cudaFree(clusterCentersEach));
-  RAFT_CUDA_TRY(cudaFree(clusterCentersMP));
-  RAFT_CUDA_TRY(cudaFree(labelsMP));
-
-  // [numClusters, dimDataset]
-  RAFT_CUDA_TRY(cudaMallocManaged(&clusterCentersMP, numClusters * dimDataset * sizeof(float)));
-
-  // [numClusters]
-  RAFT_CUDA_TRY(cudaMallocManaged(&clusterSizeMP, numClusters * sizeof(uint32_t)));
+  clusterCentersMP.resize(numClusters * dimDataset, stream);
+  clusterSizeMP.resize(numClusters, stream);
 
   // [...]
   sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, numTrainset);
-  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+  predictWorkspace.resize(sizePredictWorkspace, stream);
 
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
@@ -755,22 +732,19 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
                           trainset,
                           dtype,
                           numTrainset,
-                          trainsetLabels,
+                          trainsetLabels.data(),
                           metric_type_,
                           true,
-                          predictWorkspace,
-                          clusterCentersMP,
-                          clusterSizeMP,
+                          predictWorkspace.data(),
+                          clusterCentersMP.data(),
+                          clusterSizeMP.data(),
                           true);
   }  // end for (int iter = 0; iter < 2; iter++)
 
   fprintf(stderr, "(%s) Final fitting\n", __func__);
 
-  RAFT_CUDA_TRY(cudaFree(trainsetLabels));
-  RAFT_CUDA_TRY(cudaFree(predictWorkspace));
-
   sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, nrow_);
-  RAFT_CUDA_TRY(cudaMallocManaged(&predictWorkspace, sizePredictWorkspace));
+  predictWorkspace.resize(sizePredictWorkspace, stream);
 
   _cuann_kmeans_predict(cublas_handle_,
                         (float*)clusterCenters,
@@ -782,9 +756,9 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
                         datasetLabels,
                         metric_type_,
                         true,
-                        predictWorkspace,
-                        clusterCentersMP,
-                        clusterSizeMP,
+                        predictWorkspace.data(),
+                        clusterCentersMP.data(),
+                        clusterSizeMP.data(),
                         true);
 
   _cuann_kmeans_predict(cublas_handle_,
@@ -797,13 +771,10 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
                         datasetLabels,
                         metric_type_,
                         true,
-                        predictWorkspace,
-                        clusterCentersMP,
-                        clusterSizeMP,
+                        predictWorkspace.data(),
+                        clusterCentersMP.data(),
+                        clusterSizeMP.data(),
                         false);
-  RAFT_CUDA_TRY(cudaFree(clusterCentersMP));
-  RAFT_CUDA_TRY(cudaFree(clusterSizeMP));
-  RAFT_CUDA_TRY(cudaFree(predictWorkspace));
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
@@ -813,14 +784,15 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
                                               cudaDataType_t dtype,
                                               uint32_t nrow,
                                               uint32_t ntrain,
-                                              cudaStream_t stream)
+                                              rmm::cuda_stream_view stream)
 {
   nrow_   = nrow;
   dtype_  = dtype;
   stream_ = stream;
 
-  float* centriod_manage_ptr = nullptr;
-  RAFT_CUDA_TRY(cudaMallocManaged(&centriod_manage_ptr, sizeof(float) * nlist_ * dim_));
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::device_uvector<float> centriod_manage_buf(nlist_ * dim_, stream, &managed_memory);
+  auto centriod_manage_ptr = centriod_manage_buf.data();
 
   if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
   if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
@@ -828,8 +800,8 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   }
 
   // Alloc manage memory for centriods, trainset and workspace
-  uint32_t* datasetLabels;  // [numDataset]
-  RAFT_CUDA_TRY(cudaMallocManaged(&datasetLabels, sizeof(uint32_t) * nrow_));
+  rmm::device_uvector<uint32_t> datasetLabels_buf(nrow_, stream, &managed_memory);  // [numDataset]
+  auto datasetLabels = datasetLabels_buf.data();
 
   // Step 3: Predict labels of the whole dataset
   cuivflBuildOptimizedKmeans(
@@ -917,9 +889,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
                                 cudaMemcpyDefault,
                                 stream));
 
-  RAFT_CUDA_TRY(cudaFree(datasetLabels));
-  RAFT_CUDA_TRY(cudaFree(centriod_manage_ptr));
-
   // Store index on GPU memory: temp WAR until we've entire index building buffers on device
   RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
   RAFT_CUDA_TRY(cudaMalloc(&list_lengths_dev_ptr_, sizeof(uint32_t) * nlist_));
@@ -1134,7 +1103,7 @@ cuivflStatus_t cuivflHandle::cuivflSearch(const void* queries,  // [numQueries,
                                           uint32_t k,
                                           size_t* neighbors,  // [numQueries, topK]
                                           float* distances,
-                                          cudaStream_t stream,
+                                          rmm::cuda_stream_view stream,
                                           cudaDataType_t dtype)
 {
   switch (dtype) {
@@ -1166,7 +1135,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                               uint32_t k,
                                               size_t* neighbors,  // [numQueries, topK]
                                               value_t* distances,
-                                              cudaStream_t stream)
+                                              rmm::cuda_stream_view stream)
 {
   uint32_t nprobe = std::min(nprobe_, (uint32_t)nlist_);
   stream_         = stream;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index d5b95af99c..08aa77bb1d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -31,6 +31,9 @@
 #include <raft/distance/distance.hpp>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
@@ -208,11 +211,14 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
                            uint32_t* clusterSize = NULL,  // [numCenters,]
                            bool updateCenter     = true)
 {
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
   if (!isCenterSet) {
     // If centers are not set, the labels will be determined randomly.
-    for (uint32_t i = 0; i < numDataset; i++) {
-      labels[i] = i % numCenters;
-    }
+    linalg::writeOnlyUnaryOp(
+      labels,
+      numDataset,
+      [numCenters] __device__(uint32_t * out, uint32_t i) { *out = i % numCenters; },
+      stream);
     if (tempCenters != NULL && clusterSize != NULL) {
       // update centers
       _cuann_kmeans_update_centers(
@@ -223,9 +229,12 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
 
   uint32_t chunk  = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
   void* workspace = _workspace;
+  rmm::device_buffer sub_workspace(0, stream);
+
   if (_workspace == NULL) {
-    size_t sizeWorkspace = _cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset);
-    RAFT_CUDA_TRY(cudaMallocManaged(&workspace, sizeWorkspace));
+    sub_workspace.resize(_cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset),
+                         stream);
+    workspace = sub_workspace.data();
   }
   float* curDataset;  // [chunk, dimCenters]
   void* bufDataset;   // [chunk, dimCenters]
@@ -326,8 +335,6 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
                                  clusterSize,
                                  tempCenters);
   }
-
-  if (_workspace == NULL) { cudaFree(workspace); }
 }
 
 // adjust centers which have small number of entries

From dd558b450150dc7f142890c6fabdb52ee7c13dcd Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 18 May 2022 11:52:56 +0200
Subject: [PATCH 011/118] Update
 cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh

Co-authored-by: Tamas Bela Feher <tfeher@nvidia.com>
---
 cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index ae6a3de4c9..5e8ab55a5b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -192,6 +192,8 @@ void approx_knn_build_index(raft::handle_t& handle,
 
       approx_knn_cuivfl_ivfflat_build_index(
         index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
+    } else {
+      RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor =

From 94b3cbe3e511122b546a1f85cd3e42fff684fb83 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 18 May 2022 12:09:44 +0200
Subject: [PATCH 012/118] Update
 cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh

Co-authored-by: Tamas Bela Feher <tfeher@nvidia.com>
---
 cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 5e8ab55a5b..0d5e124d6d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -115,8 +115,7 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
   RAFT_CUDA_TRY(cudaMallocManaged(&trainset, ntrain * dim * sizeof(T)));
 
   for (size_t i = 0; i < ntrain; ++i) {
-    RAFT_CUDA_TRY(cudaMemcpyAsync(
-      trainset + i * dim, dataset + ratio * i * dim, dim * sizeof(T), cudaMemcpyDefault, stream));
+    copy(trainset + i * dim, dataset + ratio * i * dim, dim, stream);
   }
 
   cudaDataType_t dtype;

From 2be45a9a76d37dbe3a988d91f6878f2229621647 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 18 May 2022 12:46:41 +0200
Subject: [PATCH 013/118] wip: replace cudaMemcpy with raft::copy

---
 .../raft/spatial/knn/detail/ann_quantized_faiss.cuh    | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 0d5e124d6d..1743a358cd 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -186,9 +186,7 @@ void approx_knn_build_index(raft::handle_t& handle,
       IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
       T* h_index_array;
       RAFT_CUDA_TRY(cudaMallocManaged(&h_index_array, n * D * sizeof(T)));
-      RAFT_CUDA_TRY(cudaMemcpyAsync(
-        h_index_array, index_array, n * D * sizeof(T), cudaMemcpyDefault, handle.get_stream()));
-
+      copy(h_index_array, index_array, n * D, handle.get_stream());
       approx_knn_cuivfl_ivfflat_build_index(
         index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
     } else {
@@ -210,11 +208,7 @@ void approx_knn_build_index(raft::handle_t& handle,
         cudaMallocManaged(&h_index_array, n * D * sizeof(T));
         // raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
         // handle.get_stream());
-        cudaMemcpyAsync((void*)h_index_array,
-                        (void*)index_array,
-                        n * D * sizeof(T),
-                        cudaMemcpyDefault,
-                        handle.get_stream());
+        copy(h_index_array, index_array, n * D, handle.get_stream());
         approx_knn_cuivfl_ivfflat_build_index(
           index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
       } else {

From 30c32a9c4c3f3cfaefb09c5e592a258e596e7740 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 18 May 2022 14:13:30 +0200
Subject: [PATCH 014/118] Simplified some cudaMemcpy invocations

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 46 +++---------
 .../knn/detail/ann_kmeans_balanced.cuh        | 33 ++-------
 .../raft/spatial/knn/detail/ann_utils.cuh     | 70 ++++++++++++++++++-
 3 files changed, 81 insertions(+), 68 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 388321d4b1..82ea552822 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -172,7 +172,7 @@ class cuivflHandle {
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
   void* list_data_dev_ptr_;
-  
+
   // The device memory pointer; inverted list for index; size [ninterleave_]
   uint32_t* list_index_dev_ptr_;
   // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
@@ -883,11 +883,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   }
 
   RAFT_CUDA_TRY(cudaMalloc(&centriod_dev_ptr_, sizeof(float) * nlist_ * dim_));
-  RAFT_CUDA_TRY(cudaMemcpyAsync(centriod_dev_ptr_,
-                                centriod_manage_ptr,
-                                sizeof(float) * nlist_ * dim_,
-                                cudaMemcpyDefault,
-                                stream));
+  copy(centriod_dev_ptr_, centriod_manage_ptr, nlist_ * dim_, stream);
 
   // Store index on GPU memory: temp WAR until we've entire index building buffers on device
   RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
@@ -903,41 +899,15 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   }
 
   // Step 3: Read the list
-  RAFT_CUDA_TRY(cudaMemcpyAsync(list_prefix_interleaved_dev_ptr_,
-                                list_prefix_interleaved_host_ptr_,
-                                sizeof(uint32_t) * nlist_,
-                                cudaMemcpyHostToDevice,
-                                stream));
-  RAFT_CUDA_TRY(cudaMemcpyAsync(list_lengths_dev_ptr_,
-                                list_lengths_host_ptr_,
-                                sizeof(uint32_t) * nlist_,
-                                cudaMemcpyHostToDevice,
-                                stream));
+  copy(list_prefix_interleaved_dev_ptr_, list_prefix_interleaved_host_ptr_, nlist_, stream);
+  copy(list_lengths_dev_ptr_, list_lengths_host_ptr_, nlist_, stream);
 
-  if (dtype_ == CUDA_R_32F) {
-    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
-                                  list_data_host_ptr_,
-                                  sizeof(float) * ninterleave_ * dim_,
-                                  cudaMemcpyHostToDevice,
-                                  stream));
-  } else if (dtype_ == CUDA_R_8U) {
-    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
-                                  list_data_host_ptr_,
-                                  sizeof(uint8_t) * ninterleave_ * dim_,
-                                  cudaMemcpyHostToDevice,
-                                  stream));
-  } else if (dtype_ == CUDA_R_8I) {
-    RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
-                                  list_data_host_ptr_,
-                                  sizeof(int8_t) * ninterleave_ * dim_,
-                                  cudaMemcpyHostToDevice,
-                                  stream));
-  }
-  RAFT_CUDA_TRY(cudaMemcpyAsync(list_index_dev_ptr_,
-                                list_index_host_ptr_,
-                                sizeof(uint32_t) * ninterleave_,
+  RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
+                                list_data_host_ptr_,
+                                utils::cuda_datatype_size(dtype_) * ninterleave_ * dim_,
                                 cudaMemcpyHostToDevice,
                                 stream));
+  copy(list_index_dev_ptr_, list_index_host_ptr_, ninterleave_, stream);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildIndex
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 08aa77bb1d..637086d391 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -250,37 +250,16 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
     utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
   }
 
-  cudaMemcpyKind kind;
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, dataset);
-  if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
-    kind = cudaMemcpyDeviceToDevice;
-  } else {
-    kind = cudaMemcpyHostToDevice;
-  }
-
+  auto elem_size = utils::cuda_datatype_size(dtype);
   for (uint64_t is = 0; is < numDataset; is += chunk) {
     uint64_t ie       = min(is + chunk, (uint64_t)numDataset);
     uint32_t nDataset = ie - is;
 
-    if (dtype == CUDA_R_32F) {
-      RAFT_CUDA_TRY(cudaMemcpy(bufDataset,
-                               (float*)dataset + (is * dimCenters),
-                               sizeof(float) * nDataset * dimCenters,
-                               cudaMemcpyDefault));
-    } else if (dtype == CUDA_R_8U) {
-      RAFT_CUDA_TRY(cudaMemcpyAsync(bufDataset,
-                                    (uint8_t*)dataset + (is * dimCenters),
-                                    sizeof(uint8_t) * nDataset * dimCenters,
-                                    kind,
-                                    NULL));
-    } else if (dtype == CUDA_R_8I) {
-      RAFT_CUDA_TRY(cudaMemcpyAsync(bufDataset,
-                                    (int8_t*)dataset + (is * dimCenters),
-                                    sizeof(int8_t) * nDataset * dimCenters,
-                                    kind,
-                                    NULL));
-    }
+    RAFT_CUDA_TRY(
+      cudaMemcpy(bufDataset,
+                 reinterpret_cast<const uint8_t*>(dataset) + is * dimCenters * elem_size,
+                 elem_size * nDataset * dimCenters,
+                 cudaMemcpyDefault));
 
     if (dtype == CUDA_R_32F) {
       // No need to copy when dtype is CUDA_R_32F
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index ccfe73ffd0..2b8542b87f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -74,6 +74,41 @@ void printDevPtr(const T* d_cache, int len, const char* name)
   free(res);
 }
 
+inline auto cuda_datatype_size(cudaDataType_t t) -> size_t
+{
+  switch (t) {
+    case CUDA_R_8I:
+    case CUDA_C_8I:
+    case CUDA_R_8U:
+    case CUDA_C_8U: return 1;
+
+    case CUDA_R_16F:
+    case CUDA_C_16F:
+    case CUDA_R_16BF:
+    case CUDA_C_16BF:
+    case CUDA_R_16I:
+    case CUDA_C_16I:
+    case CUDA_R_16U:
+    case CUDA_C_16U: return 2;
+
+    case CUDA_R_32F:
+    case CUDA_C_32F:
+    case CUDA_R_32I:
+    case CUDA_C_32I:
+    case CUDA_R_32U:
+    case CUDA_C_32U: return 4;
+
+    case CUDA_R_64F:
+    case CUDA_C_64F:
+    case CUDA_R_64I:
+    case CUDA_C_64I:
+    case CUDA_R_64U:
+    case CUDA_C_64U: return 8;
+
+    default: RAFT_FAIL("cuda_datatype_size: unsupported dtype (%d)", t);
+  }
+}
+
 inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
 {
   const size_t ALIGN_BYTES = 256;
@@ -296,7 +331,20 @@ __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
   atomicAdd(&(output[iCol + (nCols * iRowOutput)]), input[gid] / divisor);
 }
 
-// accumulate
+/**
+ * @brief Accumulate
+ *
+ * @tparam T
+ *
+ * @param nRowsOutput
+ * @param nCols
+ * @param output device/host pointer
+ * @param count device/host pointer
+ * @param nRowsInput
+ * @param input device/host pointer
+ * @param label device/host pointer
+ * @param divisor
+ */
 template <typename T>
 void _cuann_accumulate_with_label(uint32_t nRowsOutput,
                                   uint32_t nCols,
@@ -315,6 +363,8 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
   if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
   cudaPointerGetAttributes(&attr, input);
   if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
+  cudaPointerGetAttributes(&attr, label);
+  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
   // _cuann_memset(output, 0, sizeof(float) * nRowsOutput * nCols);
   // _cuann_memset(count, 0, sizeof(uint32_t) * nRowsOutput);
 
@@ -364,7 +414,14 @@ __global__ void kern_normalize(uint32_t nRows,
   }
 }
 
-// normalize
+/**
+ * @brief Normalize
+ *
+ * @param nRows
+ * @param nCols
+ * @param a device pointer
+ * @param numSamples device pointer
+ */
 void _cuann_normalize(uint32_t nRows,
                       uint32_t nCols,
                       float* a,                   // [nRows, nCols]
@@ -390,7 +447,14 @@ __global__ void kern_divide(uint32_t nRows,
   a[gid] /= numSamples[iRow];
 }
 
-// divide
+/**
+ * @brief Divide
+ *
+ * @param nRows
+ * @param nCols
+ * @param a device pointer
+ * @param numSamples device pointer
+ */
 void _cuann_divide(uint32_t nRows,
                    uint32_t nCols,
                    float* a,                   // [nRows, nCols]

From 150a4380baf057fd8ee2f78d8a9af20251aeb40c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 11:00:12 +0200
Subject: [PATCH 015/118] Refactoring with helper functions

---
 cpp/include/raft/cuda_utils.cuh               | 61 +++++++++++++++++++
 .../knn/detail/ann_ivf_flat_kernel.cuh        |  6 +-
 .../knn/detail/ann_kmeans_balanced.cuh        |  3 +-
 .../knn/detail/ann_quantized_faiss.cuh        | 18 +-----
 .../raft/spatial/knn/detail/ann_utils.cuh     | 21 +++++++
 5 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 362dba66c5..19800cb2d9 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -649,6 +649,67 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff
 #endif
 }
 
+/**
+ * @brief Four-way byte dot product-accumulate.
+ * @tparam T Four-byte integer: int or unsigned int
+ * @tparam S Either same as T or a 4-byte vector of the same signedness.
+ *
+ * @param a
+ * @param b
+ * @param c
+ * @return dot(a, b) + c
+ */
+template <typename T, typename S = T>
+DI auto dp4a(S a, S b, T c) -> T;
+
+template <>
+DI auto dp4a(char4 a, char4 b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<int>(a.x) * static_cast<int>(b.x);
+  c += static_cast<int>(a.y) * static_cast<int>(b.y);
+  c += static_cast<int>(a.z) * static_cast<int>(b.z);
+  c += static_cast<int>(a.w) * static_cast<int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(uchar4 a, uchar4 b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<unsigned int>(a.x) * static_cast<unsigned int>(b.x);
+  c += static_cast<unsigned int>(a.y) * static_cast<unsigned int>(b.y);
+  c += static_cast<unsigned int>(a.z) * static_cast<unsigned int>(b.z);
+  c += static_cast<unsigned int>(a.w) * static_cast<unsigned int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(int a, int b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<char4*>(&a), *reinterpret_cast<char4*>(&b), c);
+#endif
+}
+
+template <>
+DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<uchar4*>(&a), *reinterpret_cast<uchar4*>(&b), c);
+#endif
+}
+
 /**
  * @brief Warp-level sum reduction
  * @param val input value
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 8cea7f4e5e..f02471094f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -985,7 +985,7 @@ void launch_interleaved_scan_kernel(
       if constexpr (veclen == 1) {
         acc += x * y;
       } else {
-        acc = __dp4a(x, y, acc);
+        acc = dp4a(x, y, acc);
       }
     } else if constexpr (std::is_same<T, float>{}) {
       acc += x * y;
@@ -1001,7 +1001,7 @@ void launch_interleaved_scan_kernel(
       } else {
         const acc_type diff = __vabsdiffu4(x, y);
         // TODO: add CUDA_ARCH based guard as IDP is only available from SM 6.1 onwards
-        acc = __dp4a(diff, diff, acc);
+        acc = dp4a(diff, diff, acc);
       }
     } else if constexpr (std::is_same<T, int8_t>{}) {
       if constexpr (veclen == 1) {
@@ -1009,7 +1009,7 @@ void launch_interleaved_scan_kernel(
         acc += diff * diff;
       } else {
         asm("vabsdiff4.u32.s32.s32 %0,%1,%2,%3;" : "=r"(x) : "r"(x), "r"(y), "r"(0));
-        acc = __dp4a(x, x, acc);
+        acc = dp4a(x, x, acc);
       }
     } else if constexpr ((std::is_same<T, float>{})) {
       const acc_type diff = x - y;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 637086d391..cc2edb0f7d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -182,8 +182,7 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
                                                   divisor);
     }
   } else {
-    cudaMemcpy(
-      centers, accumulatedCenters, sizeof(float) * numCenters * dimCenters, cudaMemcpyDefault);
+    copy(centers, accumulatedCenters, numCenters * dimCenters, rmm::cuda_stream_default);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 1743a358cd..2cf8bb0c8d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -101,15 +101,6 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
   const size_t ntrain = n / ratio;
   assert(ntrain > 0);
 
-  // T* trainset = (T*)rmm(ntrain * dim * sizeof(T));
-  // cudaMemcpyKind kind;
-  // cudaPointerAttributes attr;
-  // cudaPointerGetAttributes(&attr, dataset);
-  // if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
-  //     kind = cudaMemcpyDeviceToDevice;
-  // } else {
-  //     kind = cudaMemcpyHostToDevice;
-  // }
   // rmm::device_uvector<T> trainset(ntrain * dim, stream);
   T* trainset = nullptr;
   RAFT_CUDA_TRY(cudaMallocManaged(&trainset, ntrain * dim * sizeof(T)));
@@ -118,14 +109,7 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
     copy(trainset + i * dim, dataset + ratio * i * dim, dim, stream);
   }
 
-  cudaDataType_t dtype;
-  if (typeid(T) == typeid(float)) {
-    dtype = CUDA_R_32F;
-  } else if (typeid(T) == typeid(uint8_t)) {
-    dtype = CUDA_R_8U;
-  } else if (typeid(T) == typeid(int8_t)) {
-    dtype = CUDA_R_8I;
-  }
+  cudaDataType_t dtype = utils::cuda_datatype<T>();
 
   cuivflInit(index->handle_, metric, D, params->nlist, niter, index->device);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 2b8542b87f..515cdb19a8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -109,6 +109,27 @@ inline auto cuda_datatype_size(cudaDataType_t t) -> size_t
   }
 }
 
+template <typename T>
+inline constexpr auto cuda_datatype() -> cudaDataType_t;
+
+template <>
+inline constexpr auto cuda_datatype<float>() -> cudaDataType_t
+{
+  return CUDA_R_32F;
+}
+
+template <>
+inline constexpr auto cuda_datatype<uint8_t>() -> cudaDataType_t
+{
+  return CUDA_R_8U;
+}
+
+template <>
+inline constexpr auto cuda_datatype<int8_t>() -> cudaDataType_t
+{
+  return CUDA_R_8I;
+}
+
 inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
 {
   const size_t ALIGN_BYTES = 256;

From ddfb8ccb8f96c2f284fa8d6d9d5644d823196988 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 12:04:16 +0200
Subject: [PATCH 016/118] Make the scratch buf 3x L2 cache size

---
 cpp/bench/common/benchmark.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index 17aedec10c..67b64a212b 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -115,7 +115,7 @@ class fixture {
     int device_id     = 0;
     RAFT_CUDA_TRY(cudaGetDevice(&device_id));
     RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id));
-    scratch_buf_ = rmm::device_buffer(l2_cache_size, stream);
+    scratch_buf_ = rmm::device_buffer(l2_cache_size * 3, stream);
   }
 
   // every benchmark should be overriding this

From b788e2e8f9558c5f57d2d45c0f12d8abab9c2b7a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 13:13:31 +0200
Subject: [PATCH 017/118] Remove serialization code for now

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 164 ------------------
 1 file changed, 164 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 82ea552822..7ecb6c62a2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -128,9 +128,6 @@ class cuivflHandle {
                                   uint32_t nrow,
                                   uint32_t nTrainset,
                                   rmm::cuda_stream_view stream);
-  cuivflStatus_t cuivflSaveIndex(const char* fileName);
-
-  cuivflStatus_t cuivflLoadIndex(const char* fileName);
 
   cuivflStatus_t cuivflSetSearchParameters(const uint32_t nprobe,
                                            const uint32_t max_batch,
@@ -326,167 +323,6 @@ cuivflHandle::~cuivflHandle()
   cublasDestroy(cublas_handle_);
 }  // end func cuivflHandle::cuivflHand
 
-// cuivflLoadIndex
-cuivflStatus_t cuivflHandle::cuivflLoadIndex(const char* fileName)
-{
-  // Step 1: Open the file
-  FILE* fp = fopen(fileName, "r");
-
-  if (fp == NULL) {
-    fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName);
-    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
-  }
-  // Step 2: Write the meta data
-  size_t read_counts = 0;
-  read_counts += fread(&nrow_, sizeof(uint32_t), 1, fp);
-  read_counts += fread(&dtype_, sizeof(dtype_), 1, fp);
-  read_counts += fread(&ninterleave_, sizeof(ninterleave_), 1, fp);
-
-  size_t total_counts =
-    3 + 2 * nlist_ + nlist_ * dim_ + ninterleave_ + ninterleave_ * dim_ + nlist_;
-
-  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
-  list_lengths_host_ptr_            = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
-  list_index_host_ptr_              = (uint32_t*)malloc(sizeof(uint32_t) * ninterleave_);
-  RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
-  RAFT_CUDA_TRY(cudaMalloc(&list_lengths_dev_ptr_, sizeof(uint32_t) * nlist_));
-  RAFT_CUDA_TRY(cudaMalloc(&list_index_dev_ptr_, sizeof(uint32_t) * ninterleave_));
-
-  if (dtype_ == CUDA_R_32F) {
-    list_data_host_ptr_ = malloc(sizeof(float) * ninterleave_ * dim_);
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(float) * ninterleave_ * dim_));
-  } else if (dtype_ == CUDA_R_8U) {
-    list_data_host_ptr_ = malloc(sizeof(uint8_t) * ninterleave_ * dim_);
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(uint8_t) * ninterleave_ * dim_));
-  } else if (dtype_ == CUDA_R_8I) {
-    list_data_host_ptr_ = malloc(sizeof(int8_t) * ninterleave_ * dim_);
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(int8_t) * ninterleave_ * dim_));
-  }
-
-  centriod_host_ptr_ = (float*)malloc(sizeof(float) * nlist_ * dim_);
-  RAFT_CUDA_TRY(cudaMalloc(&centriod_dev_ptr_, sizeof(float) * nlist_ * dim_));
-
-  centriod_norm_host_ptr_ = (float*)malloc(sizeof(float) * nlist_);
-  RAFT_CUDA_TRY(cudaMalloc(&centriod_norm_dev_ptr_, sizeof(float) * nlist_));
-
-  // Step 3: Read the list
-  read_counts += fread(list_prefix_interleaved_host_ptr_, sizeof(uint32_t), nlist_, fp);
-  RAFT_CUDA_TRY(cudaMemcpy(list_prefix_interleaved_dev_ptr_,
-                           list_prefix_interleaved_host_ptr_,
-                           sizeof(uint32_t) * nlist_,
-                           cudaMemcpyHostToDevice));
-
-  read_counts += fread(list_lengths_host_ptr_, sizeof(uint32_t), nlist_, fp);
-  RAFT_CUDA_TRY(cudaMemcpy(list_lengths_dev_ptr_,
-                           list_lengths_host_ptr_,
-                           sizeof(uint32_t) * nlist_,
-                           cudaMemcpyHostToDevice));
-
-  if (dtype_ == CUDA_R_32F) {
-    read_counts += fread(list_data_host_ptr_, sizeof(float), ninterleave_ * dim_, fp);
-    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
-                             list_data_host_ptr_,
-                             sizeof(float) * ninterleave_ * dim_,
-                             cudaMemcpyHostToDevice));
-  } else if (dtype_ == CUDA_R_8U) {
-    read_counts += fread(list_data_host_ptr_, sizeof(uint8_t), ninterleave_ * dim_, fp);
-    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
-                             list_data_host_ptr_,
-                             sizeof(uint8_t) * ninterleave_ * dim_,
-                             cudaMemcpyHostToDevice));
-  } else if (dtype_ == CUDA_R_8I) {
-    read_counts += fread(list_data_host_ptr_, sizeof(int8_t), ninterleave_ * dim_, fp);
-    RAFT_CUDA_TRY(cudaMemcpy(list_data_dev_ptr_,
-                             list_data_host_ptr_,
-                             sizeof(int8_t) * ninterleave_ * dim_,
-                             cudaMemcpyHostToDevice));
-  }
-
-  read_counts += fread(list_index_host_ptr_, sizeof(uint32_t), ninterleave_, fp);
-  RAFT_CUDA_TRY(cudaMemcpy(list_index_dev_ptr_,
-                           list_index_host_ptr_,
-                           sizeof(uint32_t) * ninterleave_,
-                           cudaMemcpyHostToDevice));
-  read_counts += fread(centriod_host_ptr_, sizeof(float), nlist_ * dim_, fp);
-  RAFT_CUDA_TRY(cudaMemcpy(
-    centriod_dev_ptr_, centriod_host_ptr_, sizeof(float) * nlist_ * dim_, cudaMemcpyHostToDevice));
-
-  // centriod_norm_host_ptr_
-  read_counts += fread(centriod_norm_host_ptr_, sizeof(float), nlist_, fp);
-  RAFT_CUDA_TRY(cudaMemcpy(centriod_norm_dev_ptr_,
-                           centriod_norm_host_ptr_,
-                           sizeof(float) * nlist_,
-                           cudaMemcpyHostToDevice));
-
-#ifdef DEBUG_L2
-  utils::printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
-#endif
-  if (read_counts != total_counts) {
-    fprintf(stderr, "(%s) failed to load index to file (%s)\n", __func__, fileName);
-    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
-  }
-  fclose(fp);
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflHandle::cuivflLoadIndex
-
-// cuivflSaveIndex
-cuivflStatus_t cuivflHandle::cuivflSaveIndex(const char* fileName)
-{
-  // Step 1: Open the file
-  FILE* fp = fopen(fileName, "w");
-  if (fp == NULL) {
-    fprintf(stderr, "(%s) failed to open file (%s)\n", __func__, fileName);
-    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
-  }
-  // Step 2: Write the meta data
-  size_t written_counts = 0;
-  size_t total_counts =
-    3 + 2 * nlist_ + nlist_ * dim_ + ninterleave_ + ninterleave_ * dim_ + nlist_;
-
-  written_counts += fwrite(&nrow_, sizeof(uint32_t), 1, fp);
-
-  written_counts += fwrite(&dtype_, sizeof(dtype_), 1, fp);
-  written_counts += fwrite(&ninterleave_, sizeof(ninterleave_), 1, fp);
-  // Step 3: Write the list
-  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
-  cudaMemcpy(list_prefix_interleaved_host_ptr_,
-             list_prefix_interleaved_dev_ptr_,
-             sizeof(uint32_t) * nlist_,
-             cudaMemcpyDefault);
-  written_counts += fwrite(list_prefix_interleaved_host_ptr_, sizeof(uint32_t), nlist_, fp);
-  written_counts += fwrite(list_lengths_host_ptr_, sizeof(uint32_t), nlist_, fp);
-
-  if (dtype_ == CUDA_R_32F) {
-    written_counts += fwrite(list_data_host_ptr_, sizeof(float), ninterleave_ * dim_, fp);
-  } else if (dtype_ == CUDA_R_8U) {
-    written_counts += fwrite(list_data_host_ptr_, sizeof(uint8_t), ninterleave_ * dim_, fp);
-  } else if (dtype_ == CUDA_R_8I) {
-    written_counts += fwrite(list_data_host_ptr_, sizeof(int8_t), ninterleave_ * dim_, fp);
-  }
-
-  written_counts += fwrite(list_index_host_ptr_, sizeof(uint32_t), ninterleave_, fp);
-
-  centriod_host_ptr_ = (float*)malloc(sizeof(float) * nlist_ * dim_);
-  RAFT_CUDA_TRY(cudaMemcpy(
-    centriod_host_ptr_, centriod_dev_ptr_, sizeof(float) * nlist_ * dim_, cudaMemcpyDefault));
-
-  written_counts += fwrite(centriod_host_ptr_, sizeof(float), nlist_ * dim_, fp);
-
-  RAFT_CUDA_TRY(cudaMemcpy(
-    centriod_norm_host_ptr_, centriod_norm_dev_ptr_, nlist_ * sizeof(float), cudaMemcpyDefault));
-  written_counts += fwrite(centriod_norm_host_ptr_, sizeof(float), nlist_, fp);
-
-  if (written_counts != total_counts) {
-    fprintf(stderr, "(%s) failed to save index to file (%s)\n", __func__, fileName);
-    return cuivflStatus_t::CUIVFL_STATUS_FILEIO_ERROR;
-  }
-  fclose(fp);
-  // free(list_prefix_interleaved_host_ptr_);
-  // free(centriod_host_ptr_);
-
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflHandle::cuivflSaveIndex
-
 // cuivflBuildIndex
 
 cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,

From 3e1c14d8bc5f2502953b921e638507f66e6cbc00 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 13:48:49 +0200
Subject: [PATCH 018/118] remove obsolete comment

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index f02471094f..cedda06b24 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -1000,8 +1000,7 @@ void launch_interleaved_scan_kernel(
         acc += diff * diff;
       } else {
         const acc_type diff = __vabsdiffu4(x, y);
-        // TODO: add CUDA_ARCH based guard as IDP is only available from SM 6.1 onwards
-        acc = dp4a(diff, diff, acc);
+        acc                 = dp4a(diff, diff, acc);
       }
     } else if constexpr (std::is_same<T, int8_t>{}) {
       if constexpr (veclen == 1) {

From a001999d52113272dc9f438b33aa9a912bf1aa40 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 13:59:51 +0200
Subject: [PATCH 019/118] Add a missing sync

---
 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index cc2edb0f7d..632823ee62 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -26,6 +26,7 @@
 #include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/interruptible.hpp>
 
 //#include <label/classlabels.cuh>
 #include <raft/distance/distance.hpp>
@@ -183,6 +184,7 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
     }
   } else {
     copy(centers, accumulatedCenters, numCenters * dimCenters, rmm::cuda_stream_default);
+    interruptible::synchronize(rmm::cuda_stream_default);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {

From 2d082713911cf7a91288cc9bf99979c785fd0e01 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 15:10:16 +0200
Subject: [PATCH 020/118] Rename ann_quantized_faiss

---
 cpp/include/raft/spatial/knn/ann.cuh                           | 3 +--
 .../knn/detail/{ann_quantized_faiss.cuh => ann_quantized.cuh}  | 0
 2 files changed, 1 insertion(+), 2 deletions(-)
 rename cpp/include/raft/spatial/knn/detail/{ann_quantized_faiss.cuh => ann_quantized.cuh} (100%)

diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 5768d83601..8948be35fc 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -17,9 +17,8 @@
 #pragma once
 
 #include "ann_common.h"
-#include "detail/ann_quantized_faiss.cuh"
+#include "detail/ann_quantized.cuh"
 
-#include <faiss/gpu/GpuIndex.h>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
 namespace raft::spatial::knn {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
rename to cpp/include/raft/spatial/knn/detail/ann_quantized.cuh

From 0f88aaa32b09cc13a99f46d7e787bfc3f4a20220 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 15:57:10 +0200
Subject: [PATCH 021/118] wip from manual allocations to rmm: updated some
 parts with pointer requirements CPU/GPU

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  5 +++-
 .../knn/detail/ann_kmeans_balanced.cuh        | 27 +++++++++++++++--
 .../raft/spatial/knn/detail/ann_quantized.cuh | 26 ++++++++++------
 .../raft/spatial/knn/detail/ann_utils.cuh     | 30 +++++++++++--------
 4 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 7ecb6c62a2..f26ef36608 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -39,7 +39,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
@@ -325,6 +324,10 @@ cuivflHandle::~cuivflHandle()
 
 // cuivflBuildIndex
 
+/**
+ * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
+ *
+ */
 cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,
                                                         const void* dataset,
                                                         void* trainset,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 632823ee62..4ec9b1f7af 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -142,7 +142,18 @@ size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters,
   return size;
 }
 
-// update kmeans centers
+/**
+ * @brief update kmeans centers
+ *
+ * NB: `centers` and `clusterSize` must be accessible on GPU due to _cuann_divide/_cuann_normalize.
+ *      The rest can be both, under assumption that all pointer are accessible from the same place.
+ *
+ * i.e. two variants are possible:
+ *
+ *   1. All pointers are on the device.
+ *   2. All pointers are on the host, but `centers` and `clusterSize` are accessible from GPU.
+ *
+ */
 void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t numCenters,
                                   uint32_t dimCenters,
@@ -196,7 +207,13 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
   }
 }
 
-// predict label of dataset
+
+/**
+ * @brief predict label of dataset
+ *
+ * NB: seems that all pointers here are accessed by devicie code only
+ *
+ */
 void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
                            float* centers,  // [numCenters, dimCenters]
                            uint32_t numCenters,
@@ -317,7 +334,11 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
   }
 }
 
-// adjust centers which have small number of entries
+/**
+ * @brief adjust centers which have small number of entries
+ *
+ * NB: all pointers are used on the CPU side.
+ */
 bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t numCenters,
                                   uint32_t dimCenters,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 2cf8bb0c8d..b6a4fe1ca4 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -30,6 +30,10 @@
 #include <raft/label/classlabels.cuh>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
@@ -93,7 +97,7 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
                                            T* dataset,
                                            IntType n,
                                            IntType D,
-                                           cudaStream_t stream)
+                                           rmm::cuda_stream_view stream)
 {
   int ratio           = 2;  // TODO: take these parameters from API
   int niter           = 20;
@@ -101,20 +105,24 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
   const size_t ntrain = n / ratio;
   assert(ntrain > 0);
 
-  // rmm::device_uvector<T> trainset(ntrain * dim, stream);
-  T* trainset = nullptr;
-  RAFT_CUDA_TRY(cudaMallocManaged(&trainset, ntrain * dim * sizeof(T)));
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::device_uvector<T> trainset(ntrain * dim, stream, &managed_memory);
 
-  for (size_t i = 0; i < ntrain; ++i) {
-    copy(trainset + i * dim, dataset + ratio * i * dim, dim, stream);
-  }
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
+                                  sizeof(T) * dim,
+                                  dataset,
+                                  sizeof(T) * dim * ratio,
+                                  sizeof(T) * dim,
+                                  ntrain,
+                                  cudaMemcpyDefault,
+                                  stream));
 
   cudaDataType_t dtype = utils::cuda_datatype<T>();
 
   cuivflInit(index->handle_, metric, D, params->nlist, niter, index->device);
 
-  index->handle_->cuivflBuildIndex(dataset, trainset, dtype, n, ntrain, stream);
-  RAFT_CUDA_TRY(cudaFree(trainset));
+  // NB: `trainset` is accessed by both CPU and GPU code here.
+  index->handle_->cuivflBuildIndex(dataset, trainset.data(), dtype, n, ntrain, stream);
 }
 
 template <typename IntType = int>
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 515cdb19a8..527c47c008 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -165,17 +165,19 @@ size_t _cuann_aligned(size_t size, size_t unit = 128)
   return size;
 }
 
-// memset
+/**
+ * @brief Sets the first num bytes of the block of memory pointed by ptr to the specified value.
+ *
+ * @param[out] ptr host or device pointer
+ * @param[in] value
+ * @param[in] count
+ */
 void _cuann_memset(void* ptr, int value, size_t count)
 {
   cudaPointerAttributes attr;
   cudaPointerGetAttributes(&attr, ptr);
   if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
     RAFT_CUDA_TRY(cudaMemset(ptr, value, count));
-    // if (ret != cudaSuccess) {
-    //     fprintf(stderr, "(%s) cudaMemset() failed\n", __func__);
-    //     exit(-1);
-    // }
   } else {
     memset(ptr, value, count);
   }
@@ -355,6 +357,8 @@ __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
 /**
  * @brief Accumulate
  *
+ * Pointer residency: altogether available either on GPU or on CPU
+ *
  * @tparam T
  *
  * @param nRowsOutput
@@ -438,10 +442,10 @@ __global__ void kern_normalize(uint32_t nRows,
 /**
  * @brief Normalize
  *
- * @param nRows
- * @param nCols
- * @param a device pointer
- * @param numSamples device pointer
+ * @param[in] nRows
+ * @param[in] nCols
+ * @param[inout] a device pointer
+ * @param[in] numSamples device pointer
  */
 void _cuann_normalize(uint32_t nRows,
                       uint32_t nCols,
@@ -471,10 +475,10 @@ __global__ void kern_divide(uint32_t nRows,
 /**
  * @brief Divide
  *
- * @param nRows
- * @param nCols
- * @param a device pointer
- * @param numSamples device pointer
+ * @param[in] nRows
+ * @param[in] nCols
+ * @param[inout] a device pointer
+ * @param[in] numSamples device pointer
  */
 void _cuann_divide(uint32_t nRows,
                    uint32_t nCols,

From 363dfc93a30eb16cde7f7d9de89e69ff5f511521 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 16:11:28 +0200
Subject: [PATCH 022/118] wip from manual allocations to rmm

---
 .../raft/spatial/knn/detail/ann_quantized.cuh | 29 +++++++++----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index b6a4fe1ca4..071d758284 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -164,6 +164,7 @@ void approx_knn_build_index(raft::handle_t& handle,
                             IntType n,
                             IntType D)
 {
+  auto stream      = handle.get_stream();
   index->index     = nullptr;
   index->metric    = metric;
   index->metricArg = metricArg;
@@ -176,17 +177,18 @@ void approx_knn_build_index(raft::handle_t& handle,
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
     if (dynamic_cast<IVFFlatParam*>(params)) {
       IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
-      T* h_index_array;
-      RAFT_CUDA_TRY(cudaMallocManaged(&h_index_array, n * D * sizeof(T)));
-      copy(h_index_array, index_array, n * D, handle.get_stream());
+
+      rmm::mr::managed_memory_resource managed_memory;
+      rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
+      copy(managed_index_array.data(), index_array, n * D, stream);
       approx_knn_cuivfl_ivfflat_build_index(
-        index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
+        index, IVFFlat_param, metric, managed_index_array.data(), n, D, stream);
     } else {
       RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-      create_processor<float>(metric, n, D, 0, false, handle.get_stream());
+      create_processor<float>(metric, n, D, 0, false, stream);
 
     if (dynamic_cast<IVFFlatParam*>(params)) {
       IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
@@ -196,22 +198,19 @@ void approx_knn_build_index(raft::handle_t& handle,
           metric == raft::distance::DistanceType::L2Unexpanded ||
           metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::InnerProduct) {
-        float* h_index_array;
-        cudaMallocManaged(&h_index_array, n * D * sizeof(T));
-        // raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
-        // handle.get_stream());
-        copy(h_index_array, index_array, n * D, handle.get_stream());
+        rmm::mr::managed_memory_resource managed_memory;
+        rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
+        copy(managed_index_array.data(), index_array, n * D, stream);
         approx_knn_cuivfl_ivfflat_build_index(
-          index, IVFFlat_param, metric, h_index_array, n, D, handle.get_stream());
+          index, IVFFlat_param, metric, managed_index_array.data(), n, D, stream);
       } else {
         raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
         gpu_res->noTempMemory();
-        gpu_res->setDefaultStream(device, handle.get_stream());
+        gpu_res->setDefaultStream(device, stream);
         index->gpu_res = gpu_res;
         approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
         std::vector<float> h_index_array(n * D);
-        raft::update_host(
-          h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
+        raft::update_host(h_index_array.data(), index_array, h_index_array.size(), stream);
         query_metric_processor->revert(index_array);
         index->index->train(n, h_index_array.data());
         index->index->add(n, h_index_array.data());
@@ -221,7 +220,7 @@ void approx_knn_build_index(raft::handle_t& handle,
       RAFT_CUDA_TRY(cudaGetDevice(&device));
       raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
       gpu_res->noTempMemory();
-      gpu_res->setDefaultStream(device, handle.get_stream());
+      gpu_res->setDefaultStream(device, stream);
       index->gpu_res = gpu_res;
       query_metric_processor->preprocess(index_array);
       if (dynamic_cast<IVFPQParam*>(params)) {

From e5399f824c95c33579111158d10de1326c3a61bd Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 19 May 2022 16:19:25 +0200
Subject: [PATCH 023/118] fix style

---
 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 4ec9b1f7af..d39a034116 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -207,7 +207,6 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
   }
 }
 
-
 /**
  * @brief predict label of dataset
  *

From 306f5bfaf66342be1c8b1feb69ed70e9f383bfaf Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 08:18:39 +0200
Subject: [PATCH 024/118] Set minimum memory pool size in radix_topk to 256
 bytes

---
 cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index d7320139b1..2f06eb6558 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -576,10 +576,11 @@ void radix_topk(const T* in,
   if (mr == nullptr) {
     pool_res.emplace(
       rmm::mr::get_current_device_resource(),
-      max_chunk_size * (sizeof(Counter<T, IdxT>)            // counters
-                        + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
-                        + sizeof(T) * 2                     // T bufs
-                        ));
+      Pow2<256>::roundUp(max_chunk_size *
+                         (sizeof(Counter<T, IdxT>)            // counters
+                          + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
+                          + sizeof(T) * 2                     // T bufs
+                          )));
     mr = &(pool_res.value());
   }
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);

From fd7d2ba953a35d57cea456e9a03f98467ad3c9e3 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 08:56:23 +0200
Subject: [PATCH 025/118] wip malloc-to-rmm: removed most of the manual
 allocations

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 185 +++++++-----------
 1 file changed, 67 insertions(+), 118 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index f26ef36608..44696a2577 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -146,6 +146,8 @@ class cuivflHandle {
   uint32_t getDim();
 
  private:
+  rmm::cuda_stream_view stream_;  // The stream for build and search
+
   uint32_t device_;
   cublasHandle_t cublas_handle_;
   cudaDataType_t dtype_;
@@ -160,9 +162,8 @@ class cuivflHandle {
   size_t ninterleave_;    // The number of elements in 32 interleaved group for input dataset
   size_t buf_topk_size_;  // The size of buffer used for topk select.
   size_t floatQuerySize;  // The size of float converted queries from int8_t/uint8_t
-  rmm::cuda_stream_view stream_;  // The stream for build and search
-  uint32_t veclen;                // The vectorization length of dataset in index.
-  uint32_t gridDimX_;             // The number of blocks launched across nprobe.
+  uint32_t veclen;        // The vectorization length of dataset in index.
+  uint32_t gridDimX_;     // The number of blocks launched across nprobe.
 
  private:
   // device pointer
@@ -170,29 +171,26 @@ class cuivflHandle {
   void* list_data_dev_ptr_;
 
   // The device memory pointer; inverted list for index; size [ninterleave_]
-  uint32_t* list_index_dev_ptr_;
+  rmm::device_uvector<uint32_t> list_index_dev_;
   // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
-  uint32_t* list_prefix_interleaved_dev_ptr_;
+  rmm::device_uvector<uint32_t> list_prefix_interleaved_dev_;
   // The device memory pointer; the number of each cluster(list); size [nlist_]
-  uint32_t* list_lengths_dev_ptr_;
+  rmm::device_uvector<uint32_t> list_lengths_dev_;
   // The device memory pointer; centriod; size [nlist_, dim_]
-  float* centriod_dev_ptr_;
+  rmm::device_uvector<float> centriod_dev_;
   // The device memory pointer; centriod norm ; size [nlist_, dim_]
-  float* centriod_norm_dev_ptr_;
+  rmm::device_uvector<float> centriod_norm_dev_;
 
   // host pointer
   //  The host memory pointer; inverted list for data; size [ninterleave_, dim_]
   void* list_data_host_ptr_;
   // The host memory pointer; inverted list for index; size [ninterleave_]
-  uint32_t* list_index_host_ptr_;
+  std::vector<uint32_t> list_index_host_;
   // The host memory pointer; Used for list_data_manage_ptr_; size [nlist_]
-  uint32_t* list_prefix_interleaved_host_ptr_;
+  std::vector<uint32_t> list_prefix_interleaved_host_;
   // The host memory pointer; the number of each cluster(list); size [nlist_]
-  uint32_t* list_lengths_host_ptr_;
-  // The host memory pointer; centriod; size [nlist_, dim_]
-  float* centriod_host_ptr_;
-  // The host memory pointer; centriod norm ; size [nlist_, dim_]
-  float* centriod_norm_host_ptr_;
+  std::vector<uint32_t> list_lengths_host_;
+
   // The device memory; used for topk select.
   void* buf_dev_ptr_;
 
@@ -220,17 +218,24 @@ cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
                            uint32_t nlist,
                            uint32_t niter,
                            uint32_t device)
+  : stream_(rmm::cuda_stream_default),
+    device_(device),
+    dim_(dim),
+    nlist_(nlist),
+    niter_(niter),
+    metric_type_(metric_type),
+    list_index_dev_(0, stream_),
+    list_prefix_interleaved_dev_(0, stream_),
+    list_lengths_dev_(0, stream_),
+    centriod_dev_(0, stream_),
+    centriod_norm_dev_(0, stream_),
+    list_index_host_(0),
+    list_prefix_interleaved_host_(0),
+    list_lengths_host_(0)
 {
-  // Device
-  device_        = device;
-  dim_           = dim;
-  nlist_         = nlist;
-  niter_         = niter;
-  metric_type_   = metric_type;
   floatQuerySize = 0;
   veclen         = 1;
   gridDimX_      = 0;
-  stream_        = rmm::cuda_stream_default;
 
   if ((dim % 4) == 0) {
     veclen = 4;
@@ -247,19 +252,8 @@ cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
     throw cuivflStatus_t::CUIVFL_STATUS_CUBLAS_ERROR;
   }
 
-  list_data_dev_ptr_               = nullptr;
-  list_index_dev_ptr_              = nullptr;
-  list_prefix_interleaved_dev_ptr_ = nullptr;
-  list_lengths_dev_ptr_            = nullptr;
-  centriod_dev_ptr_                = nullptr;
-  centriod_norm_dev_ptr_           = nullptr;
-
-  list_data_host_ptr_               = nullptr;
-  list_index_host_ptr_              = nullptr;
-  list_prefix_interleaved_host_ptr_ = nullptr;
-  list_lengths_host_ptr_            = nullptr;
-  centriod_host_ptr_                = nullptr;
-  centriod_norm_host_ptr_           = nullptr;
+  list_data_dev_ptr_  = nullptr;
+  list_data_host_ptr_ = nullptr;
 
   buf_dev_ptr_           = nullptr;
   hierarchialClustering_ = true;
@@ -274,51 +268,10 @@ cuivflHandle::~cuivflHandle()
     cudaFree(list_data_dev_ptr_);
     list_data_dev_ptr_ = nullptr;
   }
-  if (list_index_dev_ptr_ != nullptr) {
-    cudaFree(list_index_dev_ptr_);
-    list_index_dev_ptr_ = nullptr;
-  }
-  if (list_prefix_interleaved_dev_ptr_ != nullptr) {
-    cudaFree(list_prefix_interleaved_dev_ptr_);
-    list_prefix_interleaved_dev_ptr_ = nullptr;
-  }
-  if (list_lengths_dev_ptr_ != nullptr) {
-    cudaFree(list_lengths_dev_ptr_);
-    list_lengths_dev_ptr_ = nullptr;
-  }
-  if (centriod_dev_ptr_ != nullptr) {
-    cudaFree(centriod_dev_ptr_);
-    centriod_dev_ptr_ = nullptr;
-  }
-  if (centriod_norm_dev_ptr_ != nullptr) {
-    cudaFree(centriod_norm_dev_ptr_);
-    centriod_norm_dev_ptr_ = nullptr;
-  }
-
   if (list_data_host_ptr_ != nullptr) {
     free(list_data_host_ptr_);
     list_data_host_ptr_ = nullptr;
   }
-  if (list_index_host_ptr_ != nullptr) {
-    free(list_index_host_ptr_);
-    list_index_host_ptr_ = nullptr;
-  }
-  if (list_prefix_interleaved_host_ptr_ != nullptr) {
-    free(list_prefix_interleaved_host_ptr_);
-    list_prefix_interleaved_host_ptr_ = nullptr;
-  }
-  if (list_lengths_host_ptr_ != nullptr) {
-    free(list_lengths_host_ptr_);
-    list_lengths_host_ptr_ = nullptr;
-  }
-  if (centriod_host_ptr_ != nullptr) {
-    free(centriod_host_ptr_);
-    centriod_host_ptr_ = nullptr;
-  }
-  if (centriod_norm_host_ptr_ != nullptr) {
-    free(centriod_norm_host_ptr_);
-    centriod_norm_host_ptr_ = nullptr;
-  }
   cublasDestroy(cublas_handle_);
 }  // end func cuivflHandle::cuivflHand
 
@@ -630,7 +583,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   stream_ = stream;
 
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> centriod_manage_buf(nlist_ * dim_, stream, &managed_memory);
+  rmm::device_uvector<float> centriod_manage_buf(nlist_ * dim_, stream_, &managed_memory);
   auto centriod_manage_ptr = centriod_manage_buf.data();
 
   if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
@@ -639,39 +592,37 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   }
 
   // Alloc manage memory for centriods, trainset and workspace
-  rmm::device_uvector<uint32_t> datasetLabels_buf(nrow_, stream, &managed_memory);  // [numDataset]
+  rmm::device_uvector<uint32_t> datasetLabels_buf(nrow_, stream_, &managed_memory);  // [numDataset]
   auto datasetLabels = datasetLabels_buf.data();
 
   // Step 3: Predict labels of the whole dataset
   cuivflBuildOptimizedKmeans(
-    centriod_manage_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream);
+    centriod_manage_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream_);
 
   // Step 3.2: Calculate the L2 related result
-  centriod_norm_host_ptr_ = (float*)malloc(sizeof(float) * nlist_);
-  RAFT_CUDA_TRY(cudaMalloc(&centriod_norm_dev_ptr_, sizeof(float) * nlist_));
+  centriod_norm_dev_.resize(nlist_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::_cuann_sqsum(nlist_, dim_, centriod_manage_ptr, centriod_norm_dev_ptr_);
+    utils::_cuann_sqsum(nlist_, dim_, centriod_manage_ptr, centriod_norm_dev_.data());
 #ifdef DEBUG_L2
-    printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+    printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
 #endif
   }
 
   // Step 4: Record the number of elements in each clusters
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
-  list_lengths_host_ptr_            = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
-  list_prefix_interleaved_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * nlist_);
-  memset(list_lengths_host_ptr_, 0, sizeof(uint32_t) * nlist_);
 
+  list_prefix_interleaved_host_.resize(nlist_);
+  list_lengths_host_.assign(nlist_, 0);
   for (uint32_t i = 0; i < nrow_; i++) {
     uint32_t id_cluster = datasetLabels[i];
-    list_lengths_host_ptr_[id_cluster] += 1;
+    list_lengths_host_[id_cluster] += 1;
   }
 
   ninterleave_ = 0;
   for (uint32_t i = 0; i < nlist_; i++) {
-    list_prefix_interleaved_host_ptr_[i] = ninterleave_;
-    ninterleave_ += ((list_lengths_host_ptr_[i] - 1) / WarpSize + 1) * WarpSize;
+    list_prefix_interleaved_host_[i] = ninterleave_;
+    ninterleave_ += ((list_lengths_host_[i] - 1) / WarpSize + 1) * WarpSize;
   }
 
   if (dtype == CUDA_R_32F) {
@@ -684,9 +635,8 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
     list_data_host_ptr_ = malloc(sizeof(int8_t) * ninterleave_ * dim_);
     memset(list_data_host_ptr_, 0, sizeof(int8_t) * ninterleave_ * dim_);
   }
-  list_index_host_ptr_ = (uint32_t*)malloc(sizeof(uint32_t) * ninterleave_);
-  memset(list_index_host_ptr_, 0, sizeof(uint32_t) * ninterleave_);
-  memset(list_lengths_host_ptr_, 0, sizeof(uint32_t) * nlist_);
+  list_index_host_.assign(ninterleave_, 0);
+  list_lengths_host_.assign(nlist_, 0);
 
   if ((dtype == CUDA_R_8I) || (dtype == CUDA_R_8U)) {
     if ((dim_ % 16) == 0) {
@@ -698,8 +648,8 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
 
   for (size_t i = 0; i < nrow_; i++) {
     uint32_t id_cluster     = datasetLabels[i];
-    uint32_t current_add    = list_lengths_host_ptr_[id_cluster];
-    uint32_t interleave_add = list_prefix_interleaved_host_ptr_[id_cluster];
+    uint32_t current_add    = list_lengths_host_[id_cluster];
+    uint32_t interleave_add = list_prefix_interleaved_host_[id_cluster];
 
     if (dtype == CUDA_R_32F) {
       float* list_data = (float*)list_data_host_ptr_;
@@ -717,17 +667,15 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
       _ivfflat_interleaved(
         list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
     }
-    list_index_host_ptr_[interleave_add + current_add] = i;
-    list_lengths_host_ptr_[id_cluster] += 1;
+    list_index_host_[interleave_add + current_add] = i;
+    list_lengths_host_[id_cluster] += 1;
   }
 
-  RAFT_CUDA_TRY(cudaMalloc(&centriod_dev_ptr_, sizeof(float) * nlist_ * dim_));
-  copy(centriod_dev_ptr_, centriod_manage_ptr, nlist_ * dim_, stream);
-
   // Store index on GPU memory: temp WAR until we've entire index building buffers on device
-  RAFT_CUDA_TRY(cudaMalloc(&list_prefix_interleaved_dev_ptr_, sizeof(uint32_t) * nlist_));
-  RAFT_CUDA_TRY(cudaMalloc(&list_lengths_dev_ptr_, sizeof(uint32_t) * nlist_));
-  RAFT_CUDA_TRY(cudaMalloc(&list_index_dev_ptr_, sizeof(uint32_t) * ninterleave_));
+  list_index_dev_.resize(ninterleave_, stream_);
+  list_prefix_interleaved_dev_.resize(nlist_, stream_);
+  list_lengths_dev_.resize(nlist_, stream_);
+  centriod_dev_.resize(nlist_ * dim_, stream_);
 
   if (dtype_ == CUDA_R_32F) {
     RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(float) * ninterleave_ * dim_));
@@ -738,15 +686,16 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   }
 
   // Step 3: Read the list
-  copy(list_prefix_interleaved_dev_ptr_, list_prefix_interleaved_host_ptr_, nlist_, stream);
-  copy(list_lengths_dev_ptr_, list_lengths_host_ptr_, nlist_, stream);
+  copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
+  copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
+  copy(centriod_dev_.data(), centriod_manage_ptr, nlist_ * dim_, stream_);
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
                                 list_data_host_ptr_,
                                 utils::cuda_datatype_size(dtype_) * ninterleave_ * dim_,
                                 cudaMemcpyHostToDevice,
-                                stream));
-  copy(list_index_dev_ptr_, list_index_host_ptr_, ninterleave_, stream);
+                                stream_));
+  copy(list_index_dev_.data(), list_index_host_.data(), ninterleave_, stream_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildIndex
@@ -1001,9 +950,9 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
     beta  = 1.0f;
     utils::_cuann_sqsum(batch_size, dim_, convertedQueries, query_norm_dev_ptr);
     utils::_cuann_outer_add(
-      query_norm_dev_ptr, batch_size, centriod_norm_dev_ptr_, nlist_, distance_buffer_dev_ptr);
+      query_norm_dev_ptr, batch_size, centriod_norm_dev_.data(), nlist_, distance_buffer_dev_ptr);
 #ifdef DEBUG_L2
-    utils::printDevPtr(centriod_norm_dev_ptr_, 20, "centriod_norm_dev_ptr_");
+    utils::printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
     utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
 #endif
   } else {
@@ -1018,7 +967,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                batch_size,
                dim_,
                &alpha,
-               centriod_dev_ptr_,
+               centriod_dev_.data(),
                CUDA_R_32F,
                dim_,
                convertedQueries,
@@ -1059,10 +1008,10 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   if constexpr (std::is_same<T, float>{}) {
     ivfflat_interleaved_scan<float, float>(queries,
                                            coarse_indices_dev_ptr,
-                                           list_index_dev_ptr_,
+                                           list_index_dev_.data(),
                                            list_data_dev_ptr_,
-                                           list_lengths_dev_ptr_,
-                                           list_prefix_interleaved_dev_ptr_,
+                                           list_lengths_dev_.data(),
+                                           list_prefix_interleaved_dev_.data(),
                                            metric_type_,
                                            nprobe,
                                            k,
@@ -1078,10 +1027,10 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
     // we use int32_t for accumulation, and final store in fp32
     ivfflat_interleaved_scan<uint8_t, uint32_t>(queries,
                                                 coarse_indices_dev_ptr,
-                                                list_index_dev_ptr_,
+                                                list_index_dev_.data(),
                                                 list_data_dev_ptr_,
-                                                list_lengths_dev_ptr_,
-                                                list_prefix_interleaved_dev_ptr_,
+                                                list_lengths_dev_.data(),
+                                                list_prefix_interleaved_dev_.data(),
                                                 metric_type_,
                                                 nprobe,
                                                 k,
@@ -1096,10 +1045,10 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   } else if constexpr (std::is_same<T, int8_t>{}) {
     ivfflat_interleaved_scan<int8_t, int32_t>(queries,
                                               coarse_indices_dev_ptr,
-                                              list_index_dev_ptr_,
+                                              list_index_dev_.data(),
                                               list_data_dev_ptr_,
-                                              list_lengths_dev_ptr_,
-                                              list_prefix_interleaved_dev_ptr_,
+                                              list_lengths_dev_.data(),
+                                              list_prefix_interleaved_dev_.data(),
                                               metric_type_,
                                               nprobe,
                                               k,

From 403667aac9d7b20d1ed14e36b4eb5971dc3c0eb2 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 09:20:32 +0200
Subject: [PATCH 026/118] misc cleanup

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 71 ++++---------------
 .../raft/spatial/knn/detail/ann_quantized.cuh |  2 +-
 2 files changed, 15 insertions(+), 58 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 44696a2577..411c564c26 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -16,53 +16,24 @@
 
 #pragma once
 
+#include "ann_ivf_flat_kernel.cuh"
 #include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
 #include "knn_brute_force_faiss.cuh"
-#include <cublas_v2.h>
-#include <library_types.h>
-#include <raft/spatial/knn/ann_common.h>
-//#include "ann_ivf_flat.cuh"
-#include "ann_ivf_flat_kernel.cuh"
-#include "topk/radix_topk.cuh"
-
-#include "common_faiss.h"
 #include "processing.hpp"
+#include "topk/radix_topk.cuh"
 
-#include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-
-//#include <label/classlabels.cuh>
 #include <raft/distance/distance.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
+#include <raft/distance/distance_type.hpp>
+#include <raft/spatial/knn/ann_common.h>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <thrust/iterator/transform_iterator.h>
-
-#include <raft/distance/distance_type.hpp>
-
-#include <iostream>
-#include <set>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
+namespace raft::spatial::knn::detail {
 
 template <typename T>
 void _ivfflat_interleaved(
@@ -195,7 +166,7 @@ class cuivflHandle {
   void* buf_dev_ptr_;
 
  private:
-  cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,
+  cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                                             const void* dataset,
                                             void* trainset,
                                             uint32_t* clusterSize,
@@ -281,7 +252,7 @@ cuivflHandle::~cuivflHandle()
  * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
  *
  */
-cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_ptr,
+cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                                                         const void* dataset,
                                                         void* trainset,
                                                         uint32_t* datasetLabels,
@@ -297,7 +268,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_manage_p
 
   rmm::device_uvector<uint32_t> trainsetLabels(numTrainset, stream);
 
-  float* clusterCenters = centriod_manage_ptr;
+  float* clusterCenters = centriod_managed_ptr;
 
   uint32_t numMesoClusters = pow((double)(numClusters), (double)1.0 / 2.0) + 0.5;
   fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters);
@@ -583,8 +554,8 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   stream_ = stream;
 
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> centriod_manage_buf(nlist_ * dim_, stream_, &managed_memory);
-  auto centriod_manage_ptr = centriod_manage_buf.data();
+  rmm::device_uvector<float> centriod_managed_buf(nlist_ * dim_, stream_, &managed_memory);
+  auto centriod_managed_ptr = centriod_managed_buf.data();
 
   if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
   if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
@@ -597,13 +568,13 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
 
   // Step 3: Predict labels of the whole dataset
   cuivflBuildOptimizedKmeans(
-    centriod_manage_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream_);
+    centriod_managed_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream_);
 
   // Step 3.2: Calculate the L2 related result
   centriod_norm_dev_.resize(nlist_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::_cuann_sqsum(nlist_, dim_, centriod_manage_ptr, centriod_norm_dev_.data());
+    utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data());
 #ifdef DEBUG_L2
     printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
 #endif
@@ -688,7 +659,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
   // Step 3: Read the list
   copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
   copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
-  copy(centriod_dev_.data(), centriod_manage_ptr, nlist_ * dim_, stream_);
+  copy(centriod_dev_.data(), centriod_managed_ptr, nlist_ * dim_, stream_);
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
                                 list_data_host_ptr_,
@@ -1099,18 +1070,4 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflHandle::cuivflSearchImpl
 
-void cuivflInit(std::unique_ptr<detail::cuivflHandle>& handle,
-                raft::distance::DistanceType metric,
-                int D,
-                int nlist,
-                int niter,
-                int deviceId)
-{
-  handle = std::make_unique<cuivflHandle>(metric, D, nlist, niter, deviceId);
-  return;
-}
-
-}  // namespace detail
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 071d758284..710724a69d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -119,7 +119,7 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
 
   cudaDataType_t dtype = utils::cuda_datatype<T>();
 
-  cuivflInit(index->handle_, metric, D, params->nlist, niter, index->device);
+  index->handle_ = std::make_unique<cuivflHandle>(metric, D, params->nlist, niter, index->device);
 
   // NB: `trainset` is accessed by both CPU and GPU code here.
   index->handle_->cuivflBuildIndex(dataset, trainset.data(), dtype, n, ntrain, stream);

From 4c6d563fc487ab8bf841fab74efcc4a79aeaf031 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 10:12:40 +0200
Subject: [PATCH 027/118] Refactoing; used raft::handle in place of cublas
 handle everywhere

---
 cpp/include/raft/linalg/detail/gemm.hpp       |   2 +-
 cpp/include/raft/linalg/gemm.cuh              |   2 +-
 cpp/include/raft/linalg/gemm.hpp              |   4 +-
 cpp/include/raft/spatial/knn/ann.cuh          |   4 +-
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 152 +++++++-----------
 .../knn/detail/ann_kmeans_balanced.cuh        |  62 ++-----
 .../raft/spatial/knn/detail/ann_quantized.cuh |  24 +--
 .../raft/spatial/knn/detail/ann_utils.cuh     |  24 ---
 8 files changed, 91 insertions(+), 183 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 29308304d8..50a8be6018 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -61,7 +61,7 @@ void gemm(const raft::handle_t& handle,
           const math_t* B,
           const int ldb,
           const math_t* beta,
-          const math_t* C,
+          math_t* C,
           const int ldc,
           cudaStream_t stream)
 {
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index 9670834ff0..16a5bc48ea 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -58,7 +58,7 @@ void gemm(const raft::handle_t& handle,
           const math_t* B,
           const int ldb,
           const math_t* beta,
-          const math_t* C,
+          math_t* C,
           const int ldc,
           cudaStream_t stream)
 {
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 56621e4f8b..37c6b2d552 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -63,7 +63,7 @@ void gemm(const raft::handle_t& handle,
           const math_t* B,
           const int ldb,
           const math_t* beta,
-          const math_t* C,
+          math_t* C,
           const int ldc,
           cudaStream_t stream)
 {
@@ -181,4 +181,4 @@ void gemm(const raft::handle_t& handle,
 }  // end namespace linalg
 }  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 8948be35fc..a5e05bc82b 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -37,7 +37,7 @@ namespace raft::spatial::knn {
  * @param[in] D the dimensionality of the index array
  */
 template <typename T = float, typename value_idx = int>
-inline void approx_knn_build_index(raft::handle_t& handle,
+inline void approx_knn_build_index(const raft::handle_t& handle,
                                    raft::spatial::knn::knnIndex* index,
                                    knnIndexParam* params,
                                    raft::distance::DistanceType metric,
@@ -64,7 +64,7 @@ inline void approx_knn_build_index(raft::handle_t& handle,
  * @param[in] n number of rows in the query array
  */
 template <typename T = float, typename value_idx = int>
-inline void approx_knn_search(raft::handle_t& handle,
+inline void approx_knn_search(const raft::handle_t& handle,
                               float* distances,
                               int64_t* indices,
                               raft::spatial::knn::knnIndex* index,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 411c564c26..cc4be0f12a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -19,14 +19,13 @@
 #include "ann_ivf_flat_kernel.cuh"
 #include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
-#include "knn_brute_force_faiss.cuh"
-#include "processing.hpp"
 #include "topk/radix_topk.cuh"
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
 #include <raft/distance/distance_type.hpp>
+#include <raft/linalg/gemm.cuh>
 #include <raft/spatial/knn/ann_common.h>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -86,18 +85,15 @@ enum cuivflStatus_t : unsigned int {
 
 class cuivflHandle {
  public:
-  cuivflHandle(raft::distance::DistanceType metric_type,
+  cuivflHandle(const handle_t& handle,
+               raft::distance::DistanceType metric_type,
                uint32_t dim,
                uint32_t nlist,
                uint32_t niter,
                uint32_t device);
   ~cuivflHandle();
-  cuivflStatus_t cuivflBuildIndex(const void* dataset,
-                                  void* trainset,
-                                  cudaDataType_t dtype,
-                                  uint32_t nrow,
-                                  uint32_t nTrainset,
-                                  rmm::cuda_stream_view stream);
+  cuivflStatus_t cuivflBuildIndex(
+    const void* dataset, void* trainset, cudaDataType_t dtype, uint32_t nrow, uint32_t nTrainset);
 
   cuivflStatus_t cuivflSetSearchParameters(const uint32_t nprobe,
                                            const uint32_t max_batch,
@@ -108,7 +104,6 @@ class cuivflHandle {
                               uint32_t k,
                               size_t* neighbors,
                               float* distances,
-                              rmm::cuda_stream_view stream,
                               cudaDataType_t dtype);
 
   cuivflStatus_t queryIVFFlatGridSize(const uint32_t nprobe,
@@ -117,10 +112,10 @@ class cuivflHandle {
   uint32_t getDim();
 
  private:
-  rmm::cuda_stream_view stream_;  // The stream for build and search
+  const handle_t& handle_;
+  const rmm::cuda_stream_view stream_;
 
   uint32_t device_;
-  cublasHandle_t cublas_handle_;
   cudaDataType_t dtype_;
   raft::distance::DistanceType metric_type_;
   bool greater_;
@@ -136,7 +131,6 @@ class cuivflHandle {
   uint32_t veclen;        // The vectorization length of dataset in index.
   uint32_t gridDimX_;     // The number of blocks launched across nprobe.
 
- private:
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
   void* list_data_dev_ptr_;
@@ -165,31 +159,28 @@ class cuivflHandle {
   // The device memory; used for topk select.
   void* buf_dev_ptr_;
 
- private:
   cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                                             const void* dataset,
                                             void* trainset,
                                             uint32_t* clusterSize,
                                             cudaDataType_t dtype,
                                             uint32_t nrow,
-                                            uint32_t ntrain,
-                                            rmm::cuda_stream_view stream);
+                                            uint32_t ntrain);
+
   template <typename T, typename value_t>
-  cuivflStatus_t cuivflSearchImpl(const T* queries,
-                                  uint32_t batch_size,
-                                  uint32_t k,
-                                  size_t* neighbors,
-                                  value_t* distances,
-                                  rmm::cuda_stream_view stream);
+  cuivflStatus_t cuivflSearchImpl(
+    const T* queries, uint32_t batch_size, uint32_t k, size_t* neighbors, value_t* distances);
 };
 
 // cuivflCreate
-cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
+cuivflHandle::cuivflHandle(const handle_t& handle,
+                           raft::distance::DistanceType metric_type,
                            uint32_t dim,
                            uint32_t nlist,
                            uint32_t niter,
                            uint32_t device)
-  : stream_(rmm::cuda_stream_default),
+  : handle_(handle),
+    stream_(handle_.get_stream()),
     device_(device),
     dim_(dim),
     nlist_(nlist),
@@ -214,15 +205,6 @@ cuivflHandle::cuivflHandle(raft::distance::DistanceType metric_type,
     veclen = 2;
   }
 
-  // cuBLAS
-  cublasStatus_t cublasError;
-  cublasError = cublasCreate(&cublas_handle_);
-
-  if (cublasError != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "(%s) cublasCreate() failed\n", __func__);
-    throw cuivflStatus_t::CUIVFL_STATUS_CUBLAS_ERROR;
-  }
-
   list_data_dev_ptr_  = nullptr;
   list_data_host_ptr_ = nullptr;
 
@@ -243,7 +225,6 @@ cuivflHandle::~cuivflHandle()
     free(list_data_host_ptr_);
     list_data_host_ptr_ = nullptr;
   }
-  cublasDestroy(cublas_handle_);
 }  // end func cuivflHandle::cuivflHand
 
 // cuivflBuildIndex
@@ -258,15 +239,14 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                                                         uint32_t* datasetLabels,
                                                         cudaDataType_t dtype,
                                                         uint32_t nrow,
-                                                        uint32_t ntrain,
-                                                        rmm::cuda_stream_view stream)
+                                                        uint32_t ntrain)
 {
   uint32_t numTrainset   = ntrain;
   uint32_t numClusters   = nlist_;
   uint32_t dimDataset    = dim_;
   uint32_t numIterations = niter_;
 
-  rmm::device_uvector<uint32_t> trainsetLabels(numTrainset, stream);
+  rmm::device_uvector<uint32_t> trainsetLabels(numTrainset, stream_);
 
   float* clusterCenters = centriod_managed_ptr;
 
@@ -275,11 +255,11 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
 
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_uvector<float> mesoClusterCenters(
-    numMesoClusters * dimDataset, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> mesoClusterLabels(numTrainset, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> mesoClusterSize_buf(numMesoClusters, stream, &managed_memory);
+    numMesoClusters * dimDataset, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mesoClusterLabels(numTrainset, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mesoClusterSize_buf(numMesoClusters, stream_, &managed_memory);
   rmm::device_uvector<float> mesoClusterCentersTemp(
-    numMesoClusters * dimDataset, stream, &managed_memory);
+    numMesoClusters * dimDataset, stream_, &managed_memory);
 
   auto mesoClusterSize = mesoClusterSize_buf.data();
 
@@ -288,7 +268,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                                      dimDataset,
                                      numTrainset  // number of vectors
     );
-  rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream);
+  rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream_);
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
     fprintf(stderr,
@@ -296,7 +276,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
             __func__,
             (float)iter / 2,
             numIterations);
-    _cuann_kmeans_predict(cublas_handle_,
+    _cuann_kmeans_predict(handle_,
                           mesoClusterCenters.data(),
                           numMesoClusters,
                           dimDataset,
@@ -360,9 +340,9 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   assert(csumFineClusters[numMesoClusters] == numClusters);
 
   // uint32_t *idsTrainset = (uint32_t *)malloc(sizeof(uint32_t) * mesoClusterSizeMax);
-  rmm::device_uvector<uint32_t> idsTrainset_buf(mesoClusterSizeMax, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> idsTrainset_buf(mesoClusterSizeMax, stream_, &managed_memory);
   rmm::device_uvector<float> subTrainset_buf(
-    mesoClusterSizeMax * dimDataset, stream, &managed_memory);
+    mesoClusterSizeMax * dimDataset, stream_, &managed_memory);
   auto idsTrainset = idsTrainset_buf.data();
   auto subTrainset = subTrainset_buf.data();
 
@@ -377,16 +357,16 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   }
 
   // label (cluster ID) of each vector
-  rmm::device_uvector<uint32_t> labelsMP(mesoClusterSizeMax, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> labelsMP(mesoClusterSizeMax, stream_, &managed_memory);
 
-  predictWorkspace.resize(sizePredictWorkspace, stream);
+  predictWorkspace.resize(sizePredictWorkspace, stream_);
 
   rmm::device_uvector<float> clusterCentersEach(
-    numFineClustersMax * dimDataset, stream, &managed_memory);
+    numFineClustersMax * dimDataset, stream_, &managed_memory);
   rmm::device_uvector<float> clusterCentersMP(
-    numFineClustersMax * dimDataset, stream, &managed_memory);
+    numFineClustersMax * dimDataset, stream_, &managed_memory);
   // number of vectors in each cluster
-  rmm::device_uvector<uint32_t> clusterSizeMP(numFineClustersMax, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> clusterSizeMP(numFineClustersMax, stream_, &managed_memory);
 
   // Training clusters in each meso-clusters
   uint32_t numClustersDone = 0;
@@ -440,7 +420,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
               (float)iter / 2,
               numIterations);
 
-      _cuann_kmeans_predict(cublas_handle_,
+      _cuann_kmeans_predict(handle_,
                             clusterCentersEach.data(),
                             numFineClusters[i],
                             dimDataset,
@@ -479,16 +459,16 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   fprintf(stderr, "\n");
   assert(numClustersDone == numClusters);
 
-  clusterCentersMP.resize(numClusters * dimDataset, stream);
-  clusterSizeMP.resize(numClusters, stream);
+  clusterCentersMP.resize(numClusters * dimDataset, stream_);
+  clusterSizeMP.resize(numClusters, stream_);
 
   // [...]
   sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, numTrainset);
-  predictWorkspace.resize(sizePredictWorkspace, stream);
+  predictWorkspace.resize(sizePredictWorkspace, stream_);
 
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
-    _cuann_kmeans_predict(cublas_handle_,
+    _cuann_kmeans_predict(handle_,
                           clusterCenters,
                           numClusters,
                           dimDataset,
@@ -507,9 +487,9 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   fprintf(stderr, "(%s) Final fitting\n", __func__);
 
   sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, nrow_);
-  predictWorkspace.resize(sizePredictWorkspace, stream);
+  predictWorkspace.resize(sizePredictWorkspace, stream_);
 
-  _cuann_kmeans_predict(cublas_handle_,
+  _cuann_kmeans_predict(handle_,
                         (float*)clusterCenters,
                         nlist_,
                         dim_,
@@ -524,7 +504,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                         clusterSizeMP.data(),
                         true);
 
-  _cuann_kmeans_predict(cublas_handle_,
+  _cuann_kmeans_predict(handle_,
                         (float*)clusterCenters,
                         nlist_,
                         dim_,
@@ -542,16 +522,11 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
 
-cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
-                                              void* trainset,
-                                              cudaDataType_t dtype,
-                                              uint32_t nrow,
-                                              uint32_t ntrain,
-                                              rmm::cuda_stream_view stream)
+cuivflStatus_t cuivflHandle::cuivflBuildIndex(
+  const void* dataset, void* trainset, cudaDataType_t dtype, uint32_t nrow, uint32_t ntrain)
 {
-  nrow_   = nrow;
-  dtype_  = dtype;
-  stream_ = stream;
+  nrow_  = nrow;
+  dtype_ = dtype;
 
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_uvector<float> centriod_managed_buf(nlist_ * dim_, stream_, &managed_memory);
@@ -568,7 +543,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(const void* dataset,
 
   // Step 3: Predict labels of the whole dataset
   cuivflBuildOptimizedKmeans(
-    centriod_managed_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain, stream_);
+    centriod_managed_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain);
 
   // Step 3.2: Calculate the L2 related result
   centriod_norm_dev_.resize(nlist_, stream_);
@@ -832,7 +807,6 @@ cuivflStatus_t cuivflHandle::cuivflSearch(const void* queries,  // [numQueries,
                                           uint32_t k,
                                           size_t* neighbors,  // [numQueries, topK]
                                           float* distances,
-                                          rmm::cuda_stream_view stream,
                                           cudaDataType_t dtype)
 {
   switch (dtype) {
@@ -841,16 +815,15 @@ cuivflStatus_t cuivflHandle::cuivflSearch(const void* queries,  // [numQueries,
                                      batch_size,
                                      k,
                                      neighbors,
-                                     reinterpret_cast<float*>(distances),
-                                     stream);
+                                     reinterpret_cast<float*>(distances));
       break;
     case CUDA_R_8U:
       cuivflSearchImpl<uint8_t, float>(
-        reinterpret_cast<const uint8_t*>(queries), batch_size, k, neighbors, distances, stream);
+        reinterpret_cast<const uint8_t*>(queries), batch_size, k, neighbors, distances);
       break;
     case CUDA_R_8I:
       cuivflSearchImpl<int8_t, float>(
-        reinterpret_cast<const int8_t*>(queries), batch_size, k, neighbors, distances, stream);
+        reinterpret_cast<const int8_t*>(queries), batch_size, k, neighbors, distances);
       break;
     default: printf("unsupported data type\n"); break;
   }
@@ -863,13 +836,10 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                               uint32_t batch_size,
                                               uint32_t k,
                                               size_t* neighbors,  // [numQueries, topK]
-                                              value_t* distances,
-                                              rmm::cuda_stream_view stream)
+                                              value_t* distances)
 {
   uint32_t nprobe = std::min(nprobe_, (uint32_t)nlist_);
-  stream_         = stream;
 
-  cublasSetStream(cublas_handle_, stream_);
   gridDimX_ = 0;
   queryIVFFlatGridSize(nprobe, batch_size, k);
   // Prepare the buffer for topk calculation
@@ -904,11 +874,11 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   if constexpr (std::is_same<T, uint8_t>{}) {
     constexpr float divisor = 256.0;
     utils::_cuann_copy<uint8_t, float>(
-      batch_size, dim_, (uint8_t*)queries, dim_, convertedQueries, dim_, stream, divisor);
+      batch_size, dim_, (uint8_t*)queries, dim_, convertedQueries, dim_, stream_, divisor);
   } else if constexpr (std::is_same<T, int8_t>{}) {
     constexpr float divisor = 128.0;
     utils::_cuann_copy<int8_t, float>(
-      batch_size, dim_, (int8_t*)queries, dim_, convertedQueries, dim_, stream, divisor);
+      batch_size, dim_, (int8_t*)queries, dim_, convertedQueries, dim_, stream_, divisor);
   } else {
     convertedQueries = (float*)(queries);
   }
@@ -931,25 +901,21 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
     beta  = 0.0f;
   }
 
-  cublasGemmEx(cublas_handle_,
-               CUBLAS_OP_T,
-               CUBLAS_OP_N,
+  linalg::gemm(handle_,
+               true,
+               false,
                nlist_,
                batch_size,
                dim_,
                &alpha,
                centriod_dev_.data(),
-               CUDA_R_32F,
                dim_,
                convertedQueries,
-               CUDA_R_32F,
                dim_,
                &beta,
                distance_buffer_dev_ptr,
-               CUDA_R_32F,
                nlist_,
-               CUDA_R_32F,
-               CUBLAS_GEMM_DEFAULT);
+               stream_);
 
 #ifdef DEBUG_L2
   utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
@@ -963,7 +929,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                              coarse_distances_dev_ptr,
                                              coarse_indices_dev_ptr,
                                              greater_,
-                                             stream);
+                                             stream_);
 #ifdef DEBUG_L2
   utils::printDevPtr(coarse_indices_dev_ptr, 1 * nprobe, "coarse_indices_dev_ptr");
   utils::printDevPtr(coarse_distances_dev_ptr, 1 * nprobe, "coarse_distances_dev_ptr");
@@ -990,7 +956,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                            dim_,
                                            indices_dev_ptr,
                                            distances_dev_ptr,
-                                           stream,
+                                           stream_,
                                            greater_,
                                            veclen,
                                            gridDimX_);
@@ -1009,7 +975,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                                 dim_,
                                                 indices_dev_ptr,
                                                 distances_dev_ptr,
-                                                stream,
+                                                stream_,
                                                 greater_,
                                                 veclen,
                                                 gridDimX_);
@@ -1027,7 +993,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                               dim_,
                                               indices_dev_ptr,
                                               distances_dev_ptr,
-                                              stream,
+                                              stream_,
                                               greater_,
                                               veclen,
                                               gridDimX_);
@@ -1051,7 +1017,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                              distances,
                                              neighbors,
                                              greater_,
-                                             stream);
+                                             stream_);
 #else
     topk::warp_sort_topk<value_t, size_t>(buf_topk_dev_ptr,
                                           buf_topk_size_,
@@ -1063,7 +1029,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                           distances,
                                           neighbors,
                                           greater_,
-                                          stream);
+                                          stream_);
 #endif
   }  // end if nprobe=1
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index d39a034116..f33219dde9 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -18,49 +18,21 @@
 
 #include "../ann_common.h"
 #include "ann_utils.cuh"
-#include "knn_brute_force_faiss.cuh"
 
-#include "common_faiss.h"
-#include "processing.hpp"
-
-#include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/interruptible.hpp>
-
-//#include <label/classlabels.cuh>
 #include <raft/distance/distance.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
+#include <raft/distance/distance_type.hpp>
+#include <raft/interruptible.hpp>
+#include <raft/linalg/gemm.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <thrust/iterator/transform_iterator.h>
-
-#include <raft/distance/distance_type.hpp>
-
-#include <iostream>
-#include <set>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-// namespace kmeans {
+namespace raft::spatial::knn::detail {
 
 // predict label of dataset
-void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle,
+void _cuann_kmeans_predict_core(const handle_t& handle,
                                 const float* centers,  // [numCenters, dimCenters]
                                 uint32_t numCenters,
                                 uint32_t dimCenters,
@@ -91,25 +63,21 @@ void _cuann_kmeans_predict_core(cublasHandle_t cublasHandle,
     alpha = -2.0;
     beta  = 1.0;
   }
-  cublasGemmEx(cublasHandle,
-               CUBLAS_OP_T,
-               CUBLAS_OP_N,
+  linalg::gemm(handle,
+               true,
+               false,
                numCenters,
                numDataset,
                dimCenters,
                &alpha,
                centers,
-               CUDA_R_32F,
                dimCenters,
                dataset,
-               CUDA_R_32F,
                dimDataset,
                &beta,
                distances,
-               CUDA_R_32F,
                numCenters,
-               CUBLAS_COMPUTE_32F,
-               CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+               handle.get_stream());
   utils::_cuann_argmin(numDataset, numCenters, distances, labels);
 }
 
@@ -213,7 +181,7 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
  * NB: seems that all pointers here are accessed by devicie code only
  *
  */
-void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
+void _cuann_kmeans_predict(const handle_t& handle,
                            float* centers,  // [numCenters, dimCenters]
                            uint32_t numCenters,
                            uint32_t dimCenters,
@@ -228,7 +196,7 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
                            uint32_t* clusterSize = NULL,  // [numCenters,]
                            bool updateCenter     = true)
 {
-  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
+  rmm::cuda_stream_view stream = handle.get_stream();
   if (!isCenterSet) {
     // If centers are not set, the labels will be determined randomly.
     linalg::writeOnlyUnaryOp(
@@ -302,7 +270,7 @@ void _cuann_kmeans_predict(cublasHandle_t cublasHandle,
     }
 
     // predict
-    _cuann_kmeans_predict_core(cublasHandle,
+    _cuann_kmeans_predict_core(handle,
                                centers,
                                numCenters,
                                dimCenters,
@@ -414,8 +382,4 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
   return adjusted;
 }
 
-//}  // namespace kmeans
-}  // namespace detail
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 710724a69d..e379014bd6 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -91,14 +91,15 @@ void approx_knn_ivfflat_build_index(
 }
 
 template <typename T = float, typename IntType = int>
-void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
+void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
+                                           knnIndex* index,
                                            IVFParam* params,
                                            raft::distance::DistanceType metric,
                                            T* dataset,
                                            IntType n,
-                                           IntType D,
-                                           rmm::cuda_stream_view stream)
+                                           IntType D)
 {
+  auto stream         = handle.get_stream();
   int ratio           = 2;  // TODO: take these parameters from API
   int niter           = 20;
   const int dim       = D;
@@ -119,10 +120,11 @@ void approx_knn_cuivfl_ivfflat_build_index(knnIndex* index,
 
   cudaDataType_t dtype = utils::cuda_datatype<T>();
 
-  index->handle_ = std::make_unique<cuivflHandle>(metric, D, params->nlist, niter, index->device);
+  index->handle_ =
+    std::make_unique<cuivflHandle>(handle, metric, D, params->nlist, niter, index->device);
 
   // NB: `trainset` is accessed by both CPU and GPU code here.
-  index->handle_->cuivflBuildIndex(dataset, trainset.data(), dtype, n, ntrain, stream);
+  index->handle_->cuivflBuildIndex(dataset, trainset.data(), dtype, n, ntrain);
 }
 
 template <typename IntType = int>
@@ -155,7 +157,7 @@ void approx_knn_ivfsq_build_index(
 }
 
 template <typename T = float, typename IntType = int>
-void approx_knn_build_index(raft::handle_t& handle,
+void approx_knn_build_index(const handle_t& handle,
                             raft::spatial::knn::knnIndex* index,
                             raft::spatial::knn::knnIndexParam* params,
                             raft::distance::DistanceType metric,
@@ -182,7 +184,7 @@ void approx_knn_build_index(raft::handle_t& handle,
       rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
       copy(managed_index_array.data(), index_array, n * D, stream);
       approx_knn_cuivfl_ivfflat_build_index(
-        index, IVFFlat_param, metric, managed_index_array.data(), n, D, stream);
+        handle, index, IVFFlat_param, metric, managed_index_array.data(), n, D);
     } else {
       RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
     }
@@ -202,7 +204,7 @@ void approx_knn_build_index(raft::handle_t& handle,
         rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
         copy(managed_index_array.data(), index_array, n * D, stream);
         approx_knn_cuivfl_ivfflat_build_index(
-          index, IVFFlat_param, metric, managed_index_array.data(), n, D, stream);
+          handle, index, IVFFlat_param, metric, managed_index_array.data(), n, D);
       } else {
         raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
         gpu_res->noTempMemory();
@@ -241,7 +243,7 @@ void approx_knn_build_index(raft::handle_t& handle,
 }
 
 template <typename T = float, typename IntType = int>
-void approx_knn_search(raft::handle_t& handle,
+void approx_knn_search(const handle_t& handle,
                        float* distances,
                        int64_t* indices,
                        raft::spatial::knn::knnIndex* index,
@@ -275,7 +277,7 @@ void approx_knn_search(raft::handle_t& handle,
         dtype = CUDA_R_8I;
       }
       index->handle_->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances, handle.get_stream(), dtype);
+        query_array, max_batch, max_k, (size_t*)indices, distances, dtype);
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
@@ -299,7 +301,7 @@ void approx_knn_search(raft::handle_t& handle,
         dtype = CUDA_R_8I;
       }
       index->handle_->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances, handle.get_stream(), dtype);
+        query_array, max_batch, max_k, (size_t*)indices, distances, dtype);
     }
     query_metric_processor->revert(query_array);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 527c47c008..cf11fd3e6b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -17,36 +17,12 @@
 #pragma once
 
 #include "../ann_common.h"
-#include "knn_brute_force_faiss.cuh"
 
-#include "common_faiss.h"
-#include "processing.hpp"
-
-#include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-
 #include <raft/distance/distance.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
-
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <thrust/iterator/transform_iterator.h>
-
 #include <raft/distance/distance_type.hpp>
 
-#include <iostream>
-#include <set>
-
 namespace raft {
 namespace spatial {
 namespace knn {

From 3ae52ea93ccf76275a142365f04a7bd7232b7a96 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 14:21:19 +0200
Subject: [PATCH 028/118] Fix the value type at runtime (use templates instead
 of runtime dtype)

---
 cpp/include/raft/spatial/knn/ann_common.h     |  65 ++-
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 542 ++++++------------
 .../knn/detail/ann_kmeans_balanced.cuh        | 112 ++--
 .../raft/spatial/knn/detail/ann_quantized.cuh |  40 +-
 .../raft/spatial/knn/detail/ann_utils.cuh     |  56 --
 5 files changed, 286 insertions(+), 529 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index cfbde4bf21..bb857eb64b 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -26,11 +26,74 @@ namespace raft {
 namespace spatial {
 namespace knn {
 
+struct cuivfl_handle_t {
+  template <typename T>
+  auto get() -> std::unique_ptr<detail::cuivflHandle<T>>&;
+
+  cuivfl_handle_t() {}
+
+  ~cuivfl_handle_t()
+  {
+    if (dtype_.has_value()) {
+      switch (*dtype_) {
+        case CUDA_R_32F: impl.float_.~unique_ptr(); break;
+        case CUDA_R_8U: impl.uint8_t_.~unique_ptr(); break;
+        case CUDA_R_8I: impl.int8_t_.~unique_ptr(); break;
+        default: break;
+      }
+    }
+  }
+
+ private:
+  union handle {
+    void* dummy;
+    std::unique_ptr<detail::cuivflHandle<float>> float_;
+    std::unique_ptr<detail::cuivflHandle<uint8_t>> uint8_t_;
+    std::unique_ptr<detail::cuivflHandle<int8_t>> int8_t_;
+    handle() : dummy(nullptr){};
+    ~handle(){};
+  } impl;
+  std::optional<cudaDataType_t> dtype_;
+};
+
+template <>
+auto cuivfl_handle_t::get<float>() -> std::unique_ptr<detail::cuivflHandle<float>>&
+{
+  if (dtype_.has_value()) {
+    RAFT_EXPECTS(*dtype_ == CUDA_R_32F, "wrong element type");
+  } else {
+    *dtype_ = CUDA_R_32F;
+  }
+  return impl.float_;
+}
+
+template <>
+auto cuivfl_handle_t::get<uint8_t>() -> std::unique_ptr<detail::cuivflHandle<uint8_t>>&
+{
+  if (dtype_.has_value()) {
+    RAFT_EXPECTS(*dtype_ == CUDA_R_8U, "wrong element type");
+  } else {
+    *dtype_ = CUDA_R_8U;
+  }
+  return impl.uint8_t_;
+}
+
+template <>
+auto cuivfl_handle_t::get<int8_t>() -> std::unique_ptr<detail::cuivflHandle<int8_t>>&
+{
+  if (dtype_.has_value()) {
+    RAFT_EXPECTS(*dtype_ == CUDA_R_8I, "wrong element type");
+  } else {
+    *dtype_ = CUDA_R_8I;
+  }
+  return impl.int8_t_;
+}
+
 struct knnIndex {
   faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
-  std::unique_ptr<detail::cuivflHandle> handle_;
+  cuivfl_handle_t handle_;
 
   raft::spatial::knn::RmmGpuResources* gpu_res;
   int device;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index cc4be0f12a..263fc366c0 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -36,7 +36,7 @@ namespace raft::spatial::knn::detail {
 
 template <typename T>
 void _ivfflat_interleaved(
-  T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
+  T* list_data, const T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
 {
   size_t group_id = index / WarpSize;
   size_t in_id    = (index % WarpSize) * veclen;
@@ -53,7 +53,7 @@ void _ivfflat_interleaved(
 //
 template <typename T>
 __global__ void write_ivf_flat_interleaved_index(
-  T* list_data, T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
+  T* list_data, const T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
 {
   size_t group_id = index / WarpSize;
   size_t in_id    = (index % WarpSize) * veclen;
@@ -83,6 +83,27 @@ enum cuivflStatus_t : unsigned int {
   CUIVFL_STATUS_NOT_BUILD         = 12
 };
 
+template <typename T>
+struct ivfflat_config {
+};
+
+template <>
+struct ivfflat_config<float> {
+  using value_t                   = float;
+  static constexpr float kDivisor = 1.0;
+};
+template <>
+struct ivfflat_config<uint8_t> {
+  using value_t                   = uint32_t;
+  static constexpr float kDivisor = 256.0;
+};
+template <>
+struct ivfflat_config<int8_t> {
+  using value_t                   = int32_t;
+  static constexpr float kDivisor = 128.0;
+};
+
+template <typename T>
 class cuivflHandle {
  public:
   cuivflHandle(const handle_t& handle,
@@ -91,50 +112,41 @@ class cuivflHandle {
                uint32_t nlist,
                uint32_t niter,
                uint32_t device);
-  ~cuivflHandle();
-  cuivflStatus_t cuivflBuildIndex(
-    const void* dataset, void* trainset, cudaDataType_t dtype, uint32_t nrow, uint32_t nTrainset);
+
+  cuivflStatus_t cuivflBuildIndex(const T* dataset, T* trainset, uint32_t nrow, uint32_t nTrainset);
 
   cuivflStatus_t cuivflSetSearchParameters(const uint32_t nprobe,
                                            const uint32_t max_batch,
                                            const uint32_t max_k);
 
-  cuivflStatus_t cuivflSearch(const void* queries,
-                              uint32_t batch_size,
-                              uint32_t k,
-                              size_t* neighbors,
-                              float* distances,
-                              cudaDataType_t dtype);
+  cuivflStatus_t cuivflSearch(
+    const T* queries, uint32_t batch_size, uint32_t k, size_t* neighbors, float* distances);
 
   cuivflStatus_t queryIVFFlatGridSize(const uint32_t nprobe,
                                       const uint32_t batch_size,
                                       const uint32_t k);
-  uint32_t getDim();
+  uint32_t getDim() { return dim_; }
 
  private:
   const handle_t& handle_;
   const rmm::cuda_stream_view stream_;
 
-  uint32_t device_;
-  cudaDataType_t dtype_;
   raft::distance::DistanceType metric_type_;
   bool greater_;
-  bool hierarchialClustering_;
-  uint32_t nlist_;        // The number of inverted lists= the number of centriods
-  uint32_t niter_;        // The number of uint32_terations for kmeans to build the indexs
-  uint32_t dim_;          // The dimension of vectors for input dataset
-  uint32_t nprobe_;       // The number of clusters for searching
-  uint32_t nrow_;         // The number of elements for input dataset
-  size_t ninterleave_;    // The number of elements in 32 interleaved group for input dataset
-  size_t buf_topk_size_;  // The size of buffer used for topk select.
-  size_t floatQuerySize;  // The size of float converted queries from int8_t/uint8_t
-  uint32_t veclen;        // The vectorization length of dataset in index.
-  uint32_t gridDimX_;     // The number of blocks launched across nprobe.
+  uint32_t nlist_;           // The number of inverted lists= the number of centriods
+  uint32_t niter_;           // The number of uint32_terations for kmeans to build the indexs
+  uint32_t dim_;             // The dimension of vectors for input dataset
+  uint32_t nprobe_;          // The number of clusters for searching
+  uint32_t nrow_;            // The number of elements for input dataset
+  size_t ninterleave_;       // The number of elements in 32 interleaved group for input dataset
+  size_t buf_topk_size_;     // The size of buffer used for topk select.
+  size_t float_query_size_;  // The size of float converted queries from int8_t/uint8_t
+  uint32_t veclen_;          // The vectorization length of dataset in index.
+  uint32_t grid_dim_x_;      // The number of blocks launched across nprobe.
 
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
-  void* list_data_dev_ptr_;
-
+  rmm::device_uvector<T> list_data_dev_;
   // The device memory pointer; inverted list for index; size [ninterleave_]
   rmm::device_uvector<uint32_t> list_index_dev_;
   // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
@@ -145,10 +157,12 @@ class cuivflHandle {
   rmm::device_uvector<float> centriod_dev_;
   // The device memory pointer; centriod norm ; size [nlist_, dim_]
   rmm::device_uvector<float> centriod_norm_dev_;
+  // The device memory; used for topk select.
+  rmm::device_buffer select_workspace_dev_;
 
   // host pointer
   //  The host memory pointer; inverted list for data; size [ninterleave_, dim_]
-  void* list_data_host_ptr_;
+  std::vector<T> list_data_host_;
   // The host memory pointer; inverted list for index; size [ninterleave_]
   std::vector<uint32_t> list_index_host_;
   // The host memory pointer; Used for list_data_manage_ptr_; size [nlist_]
@@ -156,36 +170,34 @@ class cuivflHandle {
   // The host memory pointer; the number of each cluster(list); size [nlist_]
   std::vector<uint32_t> list_lengths_host_;
 
-  // The device memory; used for topk select.
-  void* buf_dev_ptr_;
-
   cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                            const void* dataset,
-                                            void* trainset,
+                                            const T* dataset,
+                                            T* trainset,
                                             uint32_t* clusterSize,
-                                            cudaDataType_t dtype,
                                             uint32_t nrow,
                                             uint32_t ntrain);
 
-  template <typename T, typename value_t>
+  template <typename value_t>
   cuivflStatus_t cuivflSearchImpl(
     const T* queries, uint32_t batch_size, uint32_t k, size_t* neighbors, value_t* distances);
 };
 
-// cuivflCreate
-cuivflHandle::cuivflHandle(const handle_t& handle,
-                           raft::distance::DistanceType metric_type,
-                           uint32_t dim,
-                           uint32_t nlist,
-                           uint32_t niter,
-                           uint32_t device)
+template <typename T>
+cuivflHandle<T>::cuivflHandle(const handle_t& handle,
+                              raft::distance::DistanceType metric_type,
+                              uint32_t dim,
+                              uint32_t nlist,
+                              uint32_t niter,
+                              uint32_t device)
   : handle_(handle),
     stream_(handle_.get_stream()),
-    device_(device),
     dim_(dim),
     nlist_(nlist),
     niter_(niter),
     metric_type_(metric_type),
+    float_query_size_(0),
+    grid_dim_x_(0),
+    list_data_dev_(0, stream_),
     list_index_dev_(0, stream_),
     list_prefix_interleaved_dev_(0, stream_),
     list_lengths_dev_(0, stream_),
@@ -193,53 +205,26 @@ cuivflHandle::cuivflHandle(const handle_t& handle,
     centriod_norm_dev_(0, stream_),
     list_index_host_(0),
     list_prefix_interleaved_host_(0),
-    list_lengths_host_(0)
+    list_lengths_host_(0),
+    list_data_host_(0)
 {
-  floatQuerySize = 0;
-  veclen         = 1;
-  gridDimX_      = 0;
-
-  if ((dim % 4) == 0) {
-    veclen = 4;
-  } else if ((dim % 2) == 0) {
-    veclen = 2;
+  veclen_ = 16 / sizeof(T);
+  while (dim % veclen_ != 0) {
+    veclen_ = veclen_ >> 1;
   }
-
-  list_data_dev_ptr_  = nullptr;
-  list_data_host_ptr_ = nullptr;
-
-  buf_dev_ptr_           = nullptr;
-  hierarchialClustering_ = true;
 }
 
-uint32_t cuivflHandle::getDim() { return dim_; }
-
-// cuivflDestroy
-cuivflHandle::~cuivflHandle()
-{
-  if (list_data_dev_ptr_ != nullptr) {
-    cudaFree(list_data_dev_ptr_);
-    list_data_dev_ptr_ = nullptr;
-  }
-  if (list_data_host_ptr_ != nullptr) {
-    free(list_data_host_ptr_);
-    list_data_host_ptr_ = nullptr;
-  }
-}  // end func cuivflHandle::cuivflHand
-
-// cuivflBuildIndex
-
 /**
  * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
  *
  */
-cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                                        const void* dataset,
-                                                        void* trainset,
-                                                        uint32_t* datasetLabels,
-                                                        cudaDataType_t dtype,
-                                                        uint32_t nrow,
-                                                        uint32_t ntrain)
+template <typename T>
+cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
+                                                           const T* dataset,
+                                                           T* trainset,
+                                                           uint32_t* datasetLabels,
+                                                           uint32_t nrow,
+                                                           uint32_t ntrain)
 {
   uint32_t numTrainset   = ntrain;
   uint32_t numClusters   = nlist_;
@@ -281,7 +266,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                           numMesoClusters,
                           dimDataset,
                           trainset,
-                          dtype,
                           numTrainset,
                           mesoClusterLabels.data(),
                           metric_type_,
@@ -295,7 +279,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                                        numMesoClusters,
                                        dimDataset,
                                        trainset,
-                                       dtype,
                                        numTrainset,
                                        mesoClusterLabels.data(),
                                        metric_type_,
@@ -339,7 +322,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   assert(numFineClustersSum == numClusters);
   assert(csumFineClusters[numMesoClusters] == numClusters);
 
-  // uint32_t *idsTrainset = (uint32_t *)malloc(sizeof(uint32_t) * mesoClusterSizeMax);
   rmm::device_uvector<uint32_t> idsTrainset_buf(mesoClusterSizeMax, stream_, &managed_memory);
   rmm::device_uvector<float> subTrainset_buf(
     mesoClusterSizeMax * dimDataset, stream_, &managed_memory);
@@ -378,38 +360,16 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
     }
     assert(k == mesoClusterSize[i]);
 
-    if (dtype == CUDA_R_32F) {
-      float divisor = 1.0;
-      utils::_cuann_copy_with_list<float>(mesoClusterSize[i],
-                                          dimDataset,
-                                          (const float*)trainset,
-                                          (const uint32_t*)idsTrainset,
-                                          dimDataset,
-                                          subTrainset,
-                                          dimDataset,
-                                          divisor);
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-    } else if (dtype == CUDA_R_8U) {
-      float divisor = 256.0;
-      utils::_cuann_copy_with_list<uint8_t>(mesoClusterSize[i],
-                                            dimDataset,
-                                            (const uint8_t*)trainset,
-                                            (const uint32_t*)idsTrainset,
-                                            dimDataset,
-                                            subTrainset,
-                                            dimDataset,
-                                            divisor);
-    } else if (dtype == CUDA_R_8I) {
-      float divisor = 128.0;
-      utils::_cuann_copy_with_list<int8_t>(mesoClusterSize[i],
-                                           dimDataset,
-                                           (const int8_t*)trainset,
-                                           (const uint32_t*)idsTrainset,
-                                           dimDataset,
-                                           subTrainset,
-                                           dimDataset,
-                                           divisor);
-    }
+    utils::_cuann_copy_with_list<T>(mesoClusterSize[i],
+                                    dimDataset,
+                                    trainset,
+                                    idsTrainset,
+                                    dimDataset,
+                                    subTrainset,
+                                    dimDataset,
+                                    ivfflat_config<T>::kDivisor);
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
     for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
       fprintf(stderr,
               "(%s) Training kmeans of clusters in meso-cluster %u (numClusters: %u): "
@@ -425,7 +385,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                             numFineClusters[i],
                             dimDataset,
                             subTrainset,
-                            CUDA_R_32F,
                             mesoClusterSize[i],
                             labelsMP.data(),
                             metric_type_,
@@ -439,7 +398,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                                          numFineClusters[i],
                                          dimDataset,
                                          subTrainset,
-                                         CUDA_R_32F,
                                          mesoClusterSize[i],
                                          labelsMP.data(),
                                          metric_type_,
@@ -473,7 +431,6 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                           numClusters,
                           dimDataset,
                           trainset,
-                          dtype,
                           numTrainset,
                           trainsetLabels.data(),
                           metric_type_,
@@ -490,11 +447,10 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   predictWorkspace.resize(sizePredictWorkspace, stream_);
 
   _cuann_kmeans_predict(handle_,
-                        (float*)clusterCenters,
+                        clusterCenters,
                         nlist_,
                         dim_,
                         dataset,
-                        dtype,
                         nrow_,
                         datasetLabels,
                         metric_type_,
@@ -505,11 +461,10 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
                         true);
 
   _cuann_kmeans_predict(handle_,
-                        (float*)clusterCenters,
+                        clusterCenters,
                         nlist_,
                         dim_,
                         dataset,
-                        dtype,
                         nrow_,
                         datasetLabels,
                         metric_type_,
@@ -522,18 +477,21 @@ cuivflStatus_t cuivflHandle::cuivflBuildOptimizedKmeans(float* centriod_managed_
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
 
-cuivflStatus_t cuivflHandle::cuivflBuildIndex(
-  const void* dataset, void* trainset, cudaDataType_t dtype, uint32_t nrow, uint32_t ntrain)
+template <typename T>
+cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
+                                                 T* trainset,
+                                                 uint32_t nrow,
+                                                 uint32_t ntrain)
 {
-  nrow_  = nrow;
-  dtype_ = dtype;
+  nrow_ = nrow;
 
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_uvector<float> centriod_managed_buf(nlist_ * dim_, stream_, &managed_memory);
   auto centriod_managed_ptr = centriod_managed_buf.data();
 
   if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
-  if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
+  if constexpr (!std::is_same_v<T, float> && !std::is_same_v<T, uint8_t> &&
+                !std::is_same_v<T, int8_t>) {
     return CUIVFL_STATUS_UNSUPPORTED_DTYPE;
   }
 
@@ -542,8 +500,7 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(
   auto datasetLabels = datasetLabels_buf.data();
 
   // Step 3: Predict labels of the whole dataset
-  cuivflBuildOptimizedKmeans(
-    centriod_managed_ptr, dataset, trainset, datasetLabels, dtype, nrow, ntrain);
+  cuivflBuildOptimizedKmeans(centriod_managed_ptr, dataset, trainset, datasetLabels, nrow, ntrain);
 
   // Step 3.2: Calculate the L2 related result
   centriod_norm_dev_.resize(nlist_, stream_);
@@ -571,154 +528,68 @@ cuivflStatus_t cuivflHandle::cuivflBuildIndex(
     ninterleave_ += ((list_lengths_host_[i] - 1) / WarpSize + 1) * WarpSize;
   }
 
-  if (dtype == CUDA_R_32F) {
-    list_data_host_ptr_ = malloc(sizeof(float) * ninterleave_ * dim_);
-    memset(list_data_host_ptr_, 0, sizeof(float) * ninterleave_ * dim_);
-  } else if (dtype == CUDA_R_8U) {
-    list_data_host_ptr_ = malloc(sizeof(uint8_t) * ninterleave_ * dim_);
-    memset(list_data_host_ptr_, 0, sizeof(uint8_t) * ninterleave_ * dim_);
-  } else if (dtype == CUDA_R_8I) {
-    list_data_host_ptr_ = malloc(sizeof(int8_t) * ninterleave_ * dim_);
-    memset(list_data_host_ptr_, 0, sizeof(int8_t) * ninterleave_ * dim_);
-  }
+  list_data_host_.assign(ninterleave_ * dim_, 0);
   list_index_host_.assign(ninterleave_, 0);
   list_lengths_host_.assign(nlist_, 0);
 
-  if ((dtype == CUDA_R_8I) || (dtype == CUDA_R_8U)) {
-    if ((dim_ % 16) == 0) {
-      veclen = 16;
-    } else if ((dim_ % 8) == 0) {
-      veclen = 8;
-    }
-  }
-
   for (size_t i = 0; i < nrow_; i++) {
     uint32_t id_cluster     = datasetLabels[i];
     uint32_t current_add    = list_lengths_host_[id_cluster];
     uint32_t interleave_add = list_prefix_interleaved_host_[id_cluster];
-
-    if (dtype == CUDA_R_32F) {
-      float* list_data = (float*)list_data_host_ptr_;
-      float* ori_data  = (float*)dataset;
-      _ivfflat_interleaved(
-        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
-    } else if (dtype == CUDA_R_8U) {
-      uint8_t* list_data = (uint8_t*)list_data_host_ptr_;
-      uint8_t* ori_data  = (uint8_t*)dataset;
-      _ivfflat_interleaved(
-        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
-    } else if (dtype == CUDA_R_8I) {
-      int8_t* list_data = (int8_t*)list_data_host_ptr_;
-      int8_t* ori_data  = (int8_t*)dataset;
-      _ivfflat_interleaved(
-        list_data, ori_data + i * dim_, dim_, current_add, interleave_add, veclen);
-    }
+    _ivfflat_interleaved(
+      list_data_host_.data(), dataset + i * dim_, dim_, current_add, interleave_add, veclen_);
     list_index_host_[interleave_add + current_add] = i;
     list_lengths_host_[id_cluster] += 1;
   }
 
   // Store index on GPU memory: temp WAR until we've entire index building buffers on device
+  list_data_dev_.resize(ninterleave_ * dim_, stream_);
   list_index_dev_.resize(ninterleave_, stream_);
   list_prefix_interleaved_dev_.resize(nlist_, stream_);
   list_lengths_dev_.resize(nlist_, stream_);
   centriod_dev_.resize(nlist_ * dim_, stream_);
 
-  if (dtype_ == CUDA_R_32F) {
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(float) * ninterleave_ * dim_));
-  } else if (dtype_ == CUDA_R_8U) {
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(uint8_t) * ninterleave_ * dim_));
-  } else if (dtype_ == CUDA_R_8I) {
-    RAFT_CUDA_TRY(cudaMalloc(&list_data_dev_ptr_, sizeof(int8_t) * ninterleave_ * dim_));
-  }
-
   // Step 3: Read the list
   copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
   copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
   copy(centriod_dev_.data(), centriod_managed_ptr, nlist_ * dim_, stream_);
 
-  RAFT_CUDA_TRY(cudaMemcpyAsync(list_data_dev_ptr_,
-                                list_data_host_ptr_,
-                                utils::cuda_datatype_size(dtype_) * ninterleave_ * dim_,
-                                cudaMemcpyHostToDevice,
-                                stream_));
+  copy(list_data_dev_.data(), list_data_host_.data(), ninterleave_ * dim_, stream_);
   copy(list_index_dev_.data(), list_index_host_.data(), ninterleave_, stream_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildIndex
 
-cuivflStatus_t cuivflHandle::queryIVFFlatGridSize(const uint32_t nprobe,
-                                                  const uint32_t batch_size,
-                                                  const uint32_t k)
+template <typename T>
+cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
+                                                     const uint32_t batch_size,
+                                                     const uint32_t k)
 {
   // query the gridDimX size to store probes topK output
-  switch (dtype_) {
-    case CUDA_R_32F:
-      ivfflat_interleaved_scan<float, float>(nullptr,
-                                             nullptr,
-                                             nullptr,
-                                             nullptr,
-                                             nullptr,
-                                             nullptr,
-                                             metric_type_,
-                                             nprobe,
-                                             k,
-                                             batch_size,
-                                             dim_,
-                                             nullptr,
-                                             nullptr,
-                                             0,
-                                             greater_,
-                                             veclen,
-                                             gridDimX_);
-      break;
-    case CUDA_R_8U:
-      // we use int32_t for accumulation, and final store in fp32
-      ivfflat_interleaved_scan<uint8_t, uint32_t>(nullptr,
-                                                  nullptr,
-                                                  nullptr,
-                                                  nullptr,
-                                                  nullptr,
-                                                  nullptr,
-                                                  metric_type_,
-                                                  nprobe,
-                                                  k,
-                                                  batch_size,
-                                                  dim_,
-                                                  nullptr,
-                                                  nullptr,
-                                                  0,
-                                                  greater_,
-                                                  veclen,
-                                                  gridDimX_);
-      break;
-    case CUDA_R_8I:
-      ivfflat_interleaved_scan<int8_t, int32_t>(nullptr,
-                                                nullptr,
-                                                nullptr,
-                                                nullptr,
-                                                nullptr,
-                                                nullptr,
-                                                metric_type_,
-                                                nprobe,
-                                                k,
-                                                batch_size,
-                                                dim_,
-                                                nullptr,
-                                                nullptr,
-                                                0,
-                                                greater_,
-                                                veclen,
-                                                gridDimX_);
-      break;
-    default: break;
-  }
+  ivfflat_interleaved_scan<T, typename ivfflat_config<T>::value_t>(nullptr,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   metric_type_,
+                                                                   nprobe,
+                                                                   k,
+                                                                   batch_size,
+                                                                   dim_,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   0,
+                                                                   greater_,
+                                                                   veclen_,
+                                                                   grid_dim_x_);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
-// cuivflSetSearchParameters
-cuivflStatus_t cuivflHandle::cuivflSetSearchParameters(const uint32_t nprobe,
-                                                       const uint32_t max_batch,
-                                                       const uint32_t max_k)
+template <typename T>
+cuivflStatus_t cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t nprobe,
+                                                          const uint32_t max_batch,
+                                                          const uint32_t max_k)
 {
   nprobe_ = nprobe;
   if (nprobe_ <= 0) { return CUIVFL_STATUS_INVALID_VALUE; }
@@ -732,15 +603,10 @@ cuivflStatus_t cuivflHandle::cuivflSetSearchParameters(const uint32_t nprobe,
   }
 
   // Set buffer
-  if ((dtype_ == CUDA_R_8U) || (dtype_ == CUDA_R_8I)) {
-    floatQuerySize = sizeof(float) * max_batch * dim_;
-    if ((dim_ % 16) == 0) {
-      veclen = 16;
-    } else if ((dim_ % 8) == 0) {
-      veclen = 8;
-    }
+  if constexpr (std::is_integral_v<T>) {
+    float_query_size_ = sizeof(float) * max_batch * dim_;
   } else {
-    floatQuerySize = 0;
+    float_query_size_ = 0;
   }
 
   size_t buf_coarse_size = 0;
@@ -792,68 +658,47 @@ cuivflStatus_t cuivflHandle::cuivflSetSearchParameters(const uint32_t nprobe,
                                max_batch * nprobe * max_k * sizeof(float),
                                max_batch * nprobe * max_k * sizeof(size_t),
                                buf_topk_size_,
-                               floatQuerySize};
-
-  size_t total_size = utils::calc_aligned_size(sizes);
+                               float_query_size_};
 
-  if (buf_dev_ptr_ != nullptr) { RAFT_CUDA_TRY(cudaFree(buf_dev_ptr_)); }
-  RAFT_CUDA_TRY(cudaMalloc(&buf_dev_ptr_, total_size));
+  select_workspace_dev_.resize(utils::calc_aligned_size(sizes), stream_);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
-// cuivflSearch
-cuivflStatus_t cuivflHandle::cuivflSearch(const void* queries,  // [numQueries, dimDataset]
-                                          uint32_t batch_size,
-                                          uint32_t k,
-                                          size_t* neighbors,  // [numQueries, topK]
-                                          float* distances,
-                                          cudaDataType_t dtype)
+template <typename T>
+cuivflStatus_t cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries, dimDataset]
+                                             uint32_t batch_size,
+                                             uint32_t k,
+                                             size_t* neighbors,  // [numQueries, topK]
+                                             float* distances)
 {
-  switch (dtype) {
-    case CUDA_R_32F:
-      cuivflSearchImpl<float, float>(reinterpret_cast<const float*>(queries),
-                                     batch_size,
-                                     k,
-                                     neighbors,
-                                     reinterpret_cast<float*>(distances));
-      break;
-    case CUDA_R_8U:
-      cuivflSearchImpl<uint8_t, float>(
-        reinterpret_cast<const uint8_t*>(queries), batch_size, k, neighbors, distances);
-      break;
-    case CUDA_R_8I:
-      cuivflSearchImpl<int8_t, float>(
-        reinterpret_cast<const int8_t*>(queries), batch_size, k, neighbors, distances);
-      break;
-    default: printf("unsupported data type\n"); break;
-  }
-
+  cuivflSearchImpl<float>(queries, batch_size, k, neighbors, distances);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflSearch
 
-template <typename T, typename value_t>
-cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries, dimDataset]
-                                              uint32_t batch_size,
-                                              uint32_t k,
-                                              size_t* neighbors,  // [numQueries, topK]
-                                              value_t* distances)
+template <typename T>
+template <typename value_t>
+cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dimDataset]
+                                                 uint32_t batch_size,
+                                                 uint32_t k,
+                                                 size_t* neighbors,  // [numQueries, topK]
+                                                 value_t* distances)
 {
   uint32_t nprobe = std::min(nprobe_, (uint32_t)nlist_);
 
-  gridDimX_ = 0;
+  grid_dim_x_ = 0;
   queryIVFFlatGridSize(nprobe, batch_size, k);
   // Prepare the buffer for topk calculation
-  uint32_t query_norm_size            = batch_size * sizeof(float);
-  std::vector<size_t> sizes           = {query_norm_size,
+  uint32_t query_norm_size  = batch_size * sizeof(float);
+  std::vector<size_t> sizes = {query_norm_size,
                                batch_size * nlist_ * sizeof(float),
                                batch_size * nprobe * sizeof(float),
                                batch_size * nprobe * sizeof(uint32_t),
                                batch_size * nprobe * k * sizeof(float),
                                batch_size * nprobe * k * sizeof(size_t),
                                buf_topk_size_,
-                               floatQuerySize};
-  size_t total_size                   = utils::calc_aligned_size(sizes);
-  std::vector<void*> aligned_pointers = utils::calc_aligned_pointers(buf_dev_ptr_, sizes);
+                               float_query_size_};
+  std::vector<void*> aligned_pointers =
+    utils::calc_aligned_pointers(select_workspace_dev_.data(), sizes);
 
   // The norm of query [batch_size];
   float* query_norm_dev_ptr = static_cast<float*>(aligned_pointers[0]);
@@ -871,16 +716,17 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
   void* buf_topk_dev_ptr          = static_cast<void*>(aligned_pointers[6]);
   float* convertedQueries         = static_cast<float*>(aligned_pointers[7]);
 
-  if constexpr (std::is_same<T, uint8_t>{}) {
-    constexpr float divisor = 256.0;
-    utils::_cuann_copy<uint8_t, float>(
-      batch_size, dim_, (uint8_t*)queries, dim_, convertedQueries, dim_, stream_, divisor);
-  } else if constexpr (std::is_same<T, int8_t>{}) {
-    constexpr float divisor = 128.0;
-    utils::_cuann_copy<int8_t, float>(
-      batch_size, dim_, (int8_t*)queries, dim_, convertedQueries, dim_, stream_, divisor);
+  if constexpr (std::is_same_v<T, float>) {
+    convertedQueries = const_cast<float*>(queries);
   } else {
-    convertedQueries = (float*)(queries);
+    utils::_cuann_copy<T, float>(batch_size,
+                                 dim_,
+                                 queries,
+                                 dim_,
+                                 convertedQueries,
+                                 dim_,
+                                 stream_,
+                                 ivfflat_config<T>::kDivisor);
   }
 
   float alpha = 1.0f;
@@ -937,74 +783,36 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
 
   value_t* distances_dev_ptr = refined_distances_dev_ptr;
   size_t* indices_dev_ptr    = refined_indices_dev_ptr;
-  if (nprobe == 1 || gridDimX_ == 1) {
+  if (nprobe == 1 || grid_dim_x_ == 1) {
     distances_dev_ptr = distances;
     indices_dev_ptr   = neighbors;
   }
 
-  if constexpr (std::is_same<T, float>{}) {
-    ivfflat_interleaved_scan<float, float>(queries,
-                                           coarse_indices_dev_ptr,
-                                           list_index_dev_.data(),
-                                           list_data_dev_ptr_,
-                                           list_lengths_dev_.data(),
-                                           list_prefix_interleaved_dev_.data(),
-                                           metric_type_,
-                                           nprobe,
-                                           k,
-                                           batch_size,
-                                           dim_,
-                                           indices_dev_ptr,
-                                           distances_dev_ptr,
-                                           stream_,
-                                           greater_,
-                                           veclen,
-                                           gridDimX_);
-  } else if constexpr (std::is_same<T, uint8_t>{}) {
-    // we use int32_t for accumulation, and final store in fp32
-    ivfflat_interleaved_scan<uint8_t, uint32_t>(queries,
-                                                coarse_indices_dev_ptr,
-                                                list_index_dev_.data(),
-                                                list_data_dev_ptr_,
-                                                list_lengths_dev_.data(),
-                                                list_prefix_interleaved_dev_.data(),
-                                                metric_type_,
-                                                nprobe,
-                                                k,
-                                                batch_size,
-                                                dim_,
-                                                indices_dev_ptr,
-                                                distances_dev_ptr,
-                                                stream_,
-                                                greater_,
-                                                veclen,
-                                                gridDimX_);
-  } else if constexpr (std::is_same<T, int8_t>{}) {
-    ivfflat_interleaved_scan<int8_t, int32_t>(queries,
-                                              coarse_indices_dev_ptr,
-                                              list_index_dev_.data(),
-                                              list_data_dev_ptr_,
-                                              list_lengths_dev_.data(),
-                                              list_prefix_interleaved_dev_.data(),
-                                              metric_type_,
-                                              nprobe,
-                                              k,
-                                              batch_size,
-                                              dim_,
-                                              indices_dev_ptr,
-                                              distances_dev_ptr,
-                                              stream_,
-                                              greater_,
-                                              veclen,
-                                              gridDimX_);
-  }
+  ivfflat_interleaved_scan<T, typename ivfflat_config<T>::value_t>(
+    queries,
+    coarse_indices_dev_ptr,
+    list_index_dev_.data(),
+    list_data_dev_.data(),
+    list_lengths_dev_.data(),
+    list_prefix_interleaved_dev_.data(),
+    metric_type_,
+    nprobe,
+    k,
+    batch_size,
+    dim_,
+    indices_dev_ptr,
+    distances_dev_ptr,
+    stream_,
+    greater_,
+    veclen_,
+    grid_dim_x_);
 
 #ifdef DEBUG_L2
   utils::printDevPtr(distances_dev_ptr, 2 * k, "distances_dev_ptr");
   utils::printDevPtr(indices_dev_ptr, 2 * k, "indices_dev_ptr");
 #endif
 
-  if (gridDimX_ > 1) {
+  if (grid_dim_x_ > 1) {
 //#ifdef RADIX
 #if 1
     topk::radix_topk_11bits<value_t, size_t>(buf_topk_dev_ptr,
@@ -1012,7 +820,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                              refined_distances_dev_ptr,
                                              refined_indices_dev_ptr,
                                              (size_t)batch_size,
-                                             (size_t)k * gridDimX_,
+                                             (size_t)k * grid_dim_x_,
                                              (size_t)k,
                                              distances,
                                              neighbors,
@@ -1024,7 +832,7 @@ cuivflStatus_t cuivflHandle::cuivflSearchImpl(const T* queries,  // [numQueries,
                                           refined_distances_dev_ptr,
                                           refined_indices_dev_ptr,
                                           (size_t)batch_size,
-                                          (size_t)(k * gridDimX_),
+                                          (size_t)(k * grid_dim_x_),
                                           (size_t)k,
                                           distances,
                                           neighbors,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index f33219dde9..5cab5830a8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -122,11 +122,11 @@ size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters,
  *   2. All pointers are on the host, but `centers` and `clusterSize` are accessible from GPU.
  *
  */
+template <typename T>
 void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t numCenters,
                                   uint32_t dimCenters,
-                                  const void* dataset,  // [numDataset, dimCenters]
-                                  cudaDataType_t dtype,
+                                  const T* dataset,  // [numDataset, dimCenters]
                                   uint32_t numDataset,
                                   uint32_t* labels,  // [numDataset]
                                   raft::distance::DistanceType metric,
@@ -137,30 +137,12 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
     // accumulate
     utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters);
     utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
-    if (dtype == CUDA_R_32F) {
-      utils::_cuann_accumulate_with_label<float>(
-        numCenters, dimCenters, centers, clusterSize, numDataset, (const float*)dataset, labels);
-    } else if (dtype == CUDA_R_8U) {
-      constexpr float divisor = 256.0;
-      utils::_cuann_accumulate_with_label<uint8_t>(numCenters,
-                                                   dimCenters,
-                                                   centers,
-                                                   clusterSize,
-                                                   numDataset,
-                                                   (const uint8_t*)dataset,
-                                                   labels,
-                                                   divisor);
-    } else if (dtype == CUDA_R_8I) {
-      constexpr float divisor = 128.0;
-      utils::_cuann_accumulate_with_label<int8_t>(numCenters,
-                                                  dimCenters,
-                                                  centers,
-                                                  clusterSize,
-                                                  numDataset,
-                                                  (const int8_t*)dataset,
-                                                  labels,
-                                                  divisor);
-    }
+    float divisor;
+    if constexpr (std::is_same_v<T, float>) { divisor = 1.0; }
+    if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
+    if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
+    utils::_cuann_accumulate_with_label<T>(
+      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, divisor);
   } else {
     copy(centers, accumulatedCenters, numCenters * dimCenters, rmm::cuda_stream_default);
     interruptible::synchronize(rmm::cuda_stream_default);
@@ -181,12 +163,12 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
  * NB: seems that all pointers here are accessed by devicie code only
  *
  */
+template <typename T>
 void _cuann_kmeans_predict(const handle_t& handle,
                            float* centers,  // [numCenters, dimCenters]
                            uint32_t numCenters,
                            uint32_t dimCenters,
-                           const void* dataset,  // [numDataset, dimCenters]
-                           cudaDataType_t dtype,
+                           const T* dataset,  // [numDataset, dimCenters]
                            uint32_t numDataset,
                            uint32_t* labels,  // [numDataset]
                            raft::distance::DistanceType metric,
@@ -207,7 +189,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
     if (tempCenters != NULL && clusterSize != NULL) {
       // update centers
       _cuann_kmeans_update_centers(
-        centers, numCenters, dimCenters, dataset, dtype, numDataset, labels, metric, clusterSize);
+        centers, numCenters, dimCenters, dataset, numDataset, labels, metric, clusterSize);
     }
     return;
   }
@@ -222,11 +204,11 @@ void _cuann_kmeans_predict(const handle_t& handle,
     workspace = sub_workspace.data();
   }
   float* curDataset;  // [chunk, dimCenters]
-  void* bufDataset;   // [chunk, dimCenters]
+  T* bufDataset;      // [chunk, dimCenters]
   float* workspace_core;
   curDataset = (float*)workspace;
   bufDataset =
-    (void*)((uint8_t*)curDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
+    (T*)((uint8_t*)curDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
   workspace_core =
     (float*)((uint8_t*)bufDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
 
@@ -235,38 +217,22 @@ void _cuann_kmeans_predict(const handle_t& handle,
     utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
   }
 
-  auto elem_size = utils::cuda_datatype_size(dtype);
   for (uint64_t is = 0; is < numDataset; is += chunk) {
     uint64_t ie       = min(is + chunk, (uint64_t)numDataset);
     uint32_t nDataset = ie - is;
 
-    RAFT_CUDA_TRY(
-      cudaMemcpy(bufDataset,
-                 reinterpret_cast<const uint8_t*>(dataset) + is * dimCenters * elem_size,
-                 elem_size * nDataset * dimCenters,
-                 cudaMemcpyDefault));
+    copy(bufDataset, dataset + is * dimCenters, nDataset * dimCenters, stream);
+    handle.sync_stream(stream);
 
-    if (dtype == CUDA_R_32F) {
-      // No need to copy when dtype is CUDA_R_32F
-      curDataset = (float*)bufDataset;
-    } else if (dtype == CUDA_R_8U) {
-      float divisor = 256.0;
-      utils::_cuann_copy<uint8_t, float>(nDataset,
-                                         dimCenters,
-                                         (const uint8_t*)bufDataset,
-                                         dimCenters,
-                                         curDataset,
-                                         dimCenters,
-                                         divisor);
-    } else if (dtype == CUDA_R_8I) {
-      float divisor = 128.0;
-      utils::_cuann_copy<int8_t, float>(nDataset,
-                                        dimCenters,
-                                        (const int8_t*)bufDataset,
-                                        dimCenters,
-                                        curDataset,
-                                        dimCenters,
-                                        divisor);
+    if constexpr (std::is_same_v<T, float>) {
+      // No need to copy floats
+      curDataset = bufDataset;
+    } else {
+      float divisor;
+      if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
+      if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
+      utils::_cuann_copy<T, float>(
+        nDataset, dimCenters, bufDataset, dimCenters, curDataset, dimCenters, divisor);
     }
 
     // predict
@@ -292,7 +258,6 @@ void _cuann_kmeans_predict(const handle_t& handle,
                                  numCenters,
                                  dimCenters,
                                  dataset,
-                                 dtype,
                                  numDataset,
                                  labels,
                                  metric,
@@ -306,11 +271,11 @@ void _cuann_kmeans_predict(const handle_t& handle,
  *
  * NB: all pointers are used on the CPU side.
  */
+template <typename T>
 bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t numCenters,
                                   uint32_t dimCenters,
-                                  const void* dataset,  // [numDataset, dimCenters]
-                                  cudaDataType_t dtype,
+                                  const T* dataset,  // [numDataset, dimCenters]
                                   uint32_t numDataset,
                                   const uint32_t* labels,  // [numDataset]
                                   raft::distance::DistanceType metric,
@@ -328,9 +293,11 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
   uint32_t average             = numDataset / numCenters;
   uint32_t ofst;
-  if (dtype != CUDA_R_32F && dtype != CUDA_R_8U && dtype != CUDA_R_8I) {
-    fprintf(stderr, "(%s, %d) Unsupported dtype (%d)\n", __func__, __LINE__, dtype);
-  }
+  float divisor;
+  if constexpr (std::is_same_v<T, float>) { divisor = 1.0; }
+  if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
+  if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
+
   do {
     iPrimes = (iPrimes + 1) % numPrimes;
     ofst    = primes[iPrimes];
@@ -345,17 +312,12 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
     uint32_t li = labels[i];
     float sqsum = 0.0;
     for (uint32_t j = 0; j < dimCenters; j++) {
-      float val = centers[j + ((uint64_t)dimCenters * li)] * 7.0;
-      if (dtype == CUDA_R_32F) {
-        val += ((float*)dataset)[j + ((uint64_t)dimCenters * i)];
-      } else if (dtype == CUDA_R_8U) {
-        float divisor = 256.0;
-        val += ((uint8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor;
-      } else if (dtype == CUDA_R_8I) {
-        float divisor = 128.0;
-        val += ((int8_t*)dataset)[j + ((uint64_t)dimCenters * i)] / divisor;
-      }
-      val /= 8.0;
+      constexpr float kWc = 7.0;
+      constexpr float kWd = 1.0;
+      float val           = 0;
+      val += kWc * centers[j + ((uint64_t)dimCenters * li)];
+      val += kWd * dataset[j + ((uint64_t)dimCenters * i)] / divisor;
+      val /= kWc + kWd;
       sqsum += val * val;
       centers[j + ((uint64_t)dimCenters * l)] = val;
     }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index e379014bd6..2b347968e7 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -118,13 +118,11 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                   cudaMemcpyDefault,
                                   stream));
 
-  cudaDataType_t dtype = utils::cuda_datatype<T>();
-
-  index->handle_ =
-    std::make_unique<cuivflHandle>(handle, metric, D, params->nlist, niter, index->device);
+  index->handle_.get<T>() = std::make_unique<detail::cuivflHandle<T>>(
+    handle, metric, D, params->nlist, niter, index->device);
 
   // NB: `trainset` is accessed by both CPU and GPU code here.
-  index->handle_->cuivflBuildIndex(dataset, trainset.data(), dtype, n, ntrain);
+  index->handle_.get<T>()->cuivflBuildIndex(dataset, trainset.data(), n, ntrain);
 }
 
 template <typename IntType = int>
@@ -266,22 +264,13 @@ void approx_knn_search(const handle_t& handle,
       int max_batch               = n;
       int max_k                   = k;
 
-      index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-
-      cudaDataType_t dtype;
-      if (typeid(T) == typeid(float)) {
-        dtype = CUDA_R_32F;
-      } else if (typeid(T) == typeid(uint8_t)) {
-        dtype = CUDA_R_8U;
-      } else if (typeid(T) == typeid(int8_t)) {
-        dtype = CUDA_R_8I;
-      }
-      index->handle_->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances, dtype);
+      index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+      index->handle_.get<T>()->cuivflSearch(
+        query_array, max_batch, max_k, (size_t*)indices, distances);
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-      index->metric, n, index->handle_->getDim(), k, false, handle.get_stream());
+      index->metric, n, index->handle_.get<T>()->getDim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
     if (dynamic_cast<IVFFlatParam*>(params)) {
@@ -290,18 +279,9 @@ void approx_knn_search(const handle_t& handle,
       int max_batch               = n;
       int max_k                   = k;
 
-      index->handle_->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-
-      cudaDataType_t dtype;
-      if (typeid(T) == typeid(float)) {
-        dtype = CUDA_R_32F;
-      } else if (typeid(T) == typeid(uint8_t)) {
-        dtype = CUDA_R_8U;
-      } else if (typeid(T) == typeid(int8_t)) {
-        dtype = CUDA_R_8I;
-      }
-      index->handle_->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances, dtype);
+      index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+      index->handle_.get<T>()->cuivflSearch(
+        query_array, max_batch, max_k, (size_t*)indices, distances);
     }
     query_metric_processor->revert(query_array);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index cf11fd3e6b..1d66ee32f4 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -50,62 +50,6 @@ void printDevPtr(const T* d_cache, int len, const char* name)
   free(res);
 }
 
-inline auto cuda_datatype_size(cudaDataType_t t) -> size_t
-{
-  switch (t) {
-    case CUDA_R_8I:
-    case CUDA_C_8I:
-    case CUDA_R_8U:
-    case CUDA_C_8U: return 1;
-
-    case CUDA_R_16F:
-    case CUDA_C_16F:
-    case CUDA_R_16BF:
-    case CUDA_C_16BF:
-    case CUDA_R_16I:
-    case CUDA_C_16I:
-    case CUDA_R_16U:
-    case CUDA_C_16U: return 2;
-
-    case CUDA_R_32F:
-    case CUDA_C_32F:
-    case CUDA_R_32I:
-    case CUDA_C_32I:
-    case CUDA_R_32U:
-    case CUDA_C_32U: return 4;
-
-    case CUDA_R_64F:
-    case CUDA_C_64F:
-    case CUDA_R_64I:
-    case CUDA_C_64I:
-    case CUDA_R_64U:
-    case CUDA_C_64U: return 8;
-
-    default: RAFT_FAIL("cuda_datatype_size: unsupported dtype (%d)", t);
-  }
-}
-
-template <typename T>
-inline constexpr auto cuda_datatype() -> cudaDataType_t;
-
-template <>
-inline constexpr auto cuda_datatype<float>() -> cudaDataType_t
-{
-  return CUDA_R_32F;
-}
-
-template <>
-inline constexpr auto cuda_datatype<uint8_t>() -> cudaDataType_t
-{
-  return CUDA_R_8U;
-}
-
-template <>
-inline constexpr auto cuda_datatype<int8_t>() -> cudaDataType_t
-{
-  return CUDA_R_8I;
-}
-
 inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
 {
   const size_t ALIGN_BYTES = 256;

From 6fecd7fcbe920b508557c12e59c052845b68a1b1 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 14:37:57 +0200
Subject: [PATCH 029/118] ceildiv

---
 .../raft/spatial/knn/detail/ann_utils.cuh        | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 1d66ee32f4..2a8263b3d4 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -187,7 +187,7 @@ void _cuann_sqsum(uint32_t nRows,
 )
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
-  dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1);
+  dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
   kern_sqsum<<<blocks, threads>>>(nRows, nCols, a, out);
 }
 
@@ -235,7 +235,7 @@ void _cuann_copy(uint32_t nRows,
                  D divisor)
 {
   uint32_t nThreads = 128;
-  uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+  uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
   kern_copy<S, D><<<nBlocks, nThreads>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
 }
 
@@ -250,7 +250,7 @@ void _cuann_copy(uint32_t nRows,
                  D divisor)
 {
   uint32_t nThreads = 128;
-  uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+  uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
   kern_copy<S, D><<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
 }
 
@@ -316,7 +316,7 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
   if (useGPU) {
     // GPU
     uint32_t nThreads = 128;
-    uint64_t nBlocks  = (((uint64_t)nRowsInput * nCols) + nThreads - 1) / nThreads;
+    uint64_t nBlocks  = ceildiv((uint64_t)nRowsInput * nCols, (uint64_t)nThreads);
     kern_accumulate_with_label<T>
       <<<nBlocks, nThreads>>>(nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor);
   } else {
@@ -374,7 +374,7 @@ void _cuann_normalize(uint32_t nRows,
 )
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
-  dim3 blocks((nRows + threads.y - 1) / threads.y, 1, 1);
+  dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
   kern_normalize<<<blocks, threads>>>(nRows, nCols, a, numSamples);
 }
 
@@ -407,7 +407,7 @@ void _cuann_divide(uint32_t nRows,
 )
 {
   dim3 threads(128, 1, 1);
-  dim3 blocks(((uint64_t)nRows * nCols + threads.x - 1) / threads.x, 1, 1);
+  dim3 blocks(ceildiv<uint64_t>((uint64_t)nRows * (uint64_t)nCols, threads.x), 1, 1);
   kern_divide<<<blocks, threads>>>(nRows, nCols, a, numSamples);
 }
 
@@ -437,7 +437,7 @@ void _cuann_outer_add(const float* a,
 )
 {
   dim3 threads(128, 1, 1);
-  dim3 blocks(((uint64_t)numA * numB + threads.x - 1) / threads.x, 1, 1);
+  dim3 blocks(ceildiv<uint64_t>((uint64_t)numA * (uint64_t)numB, threads.x), 1, 1);
   kern_outer_add<<<blocks, threads>>>(a, numA, b, numB, c);
 }
 
@@ -500,7 +500,7 @@ void _cuann_copy_with_list(uint32_t nRows,
     }
   } else {
     uint32_t nThreads = 128;
-    uint32_t nBlocks  = ((nRows * nCols) + nThreads - 1) / nThreads;
+    uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
     kern_copy_with_list<T>
       <<<nBlocks, nThreads>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor);
   }

From 174854f22f51a2f123d9d5cc94b2dfd5a4fbe024 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 20 May 2022 17:23:46 +0200
Subject: [PATCH 030/118] Use rmm's memory pool in place of explicitly
 allocated buffers

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 222 +++++++-----------
 .../raft/spatial/knn/detail/ann_utils.cuh     |  28 ---
 .../spatial/knn/detail/topk/radix_topk.cuh    | 167 -------------
 3 files changed, 82 insertions(+), 335 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 263fc366c0..4c35cb268f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -31,6 +31,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft::spatial::knn::detail {
 
@@ -133,16 +135,14 @@ class cuivflHandle {
 
   raft::distance::DistanceType metric_type_;
   bool greater_;
-  uint32_t nlist_;           // The number of inverted lists= the number of centriods
-  uint32_t niter_;           // The number of uint32_terations for kmeans to build the indexs
-  uint32_t dim_;             // The dimension of vectors for input dataset
-  uint32_t nprobe_;          // The number of clusters for searching
-  uint32_t nrow_;            // The number of elements for input dataset
-  size_t ninterleave_;       // The number of elements in 32 interleaved group for input dataset
-  size_t buf_topk_size_;     // The size of buffer used for topk select.
-  size_t float_query_size_;  // The size of float converted queries from int8_t/uint8_t
-  uint32_t veclen_;          // The vectorization length of dataset in index.
-  uint32_t grid_dim_x_;      // The number of blocks launched across nprobe.
+  uint32_t nlist_;       // The number of inverted lists= the number of centriods
+  uint32_t niter_;       // The number of uint32_terations for kmeans to build the indexs
+  uint32_t dim_;         // The dimension of vectors for input dataset
+  uint32_t nprobe_;      // The number of clusters for searching
+  uint32_t nrow_;        // The number of elements for input dataset
+  size_t ninterleave_;   // The number of elements in 32 interleaved group for input dataset
+  uint32_t veclen_;      // The vectorization length of dataset in index.
+  uint32_t grid_dim_x_;  // The number of blocks launched across nprobe.
 
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
@@ -157,8 +157,9 @@ class cuivflHandle {
   rmm::device_uvector<float> centriod_dev_;
   // The device memory pointer; centriod norm ; size [nlist_, dim_]
   rmm::device_uvector<float> centriod_norm_dev_;
-  // The device memory; used for topk select.
-  rmm::device_buffer select_workspace_dev_;
+  // Memory pool for use during search; after the first search is done the pool is not likely to
+  // resize, saving the costs of allocations.
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res;
 
   // host pointer
   //  The host memory pointer; inverted list for data; size [ninterleave_, dim_]
@@ -195,7 +196,6 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
     nlist_(nlist),
     niter_(niter),
     metric_type_(metric_type),
-    float_query_size_(0),
     grid_dim_x_(0),
     list_data_dev_(0, stream_),
     list_index_dev_(0, stream_),
@@ -508,7 +508,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data());
 #ifdef DEBUG_L2
-    printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
+    utils::printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
 #endif
   }
 
@@ -602,65 +602,12 @@ cuivflStatus_t cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t nprobe,
     greater_ = false;
   }
 
-  // Set buffer
-  if constexpr (std::is_integral_v<T>) {
-    float_query_size_ = sizeof(float) * max_batch * dim_;
-  } else {
-    float_query_size_ = 0;
+  // Set memory buffer to be reused across searches
+  auto cur_memory_resource = rmm::mr::get_current_device_resource();
+  if (!search_mem_res.has_value() || search_mem_res->get_upstream() != cur_memory_resource) {
+    search_mem_res.emplace(cur_memory_resource,
+                           Pow2<256>::roundUp(max_batch * nprobe * max_k * 16));
   }
-
-  size_t buf_coarse_size = 0;
-  topk::radix_topk_11bits<float, uint32_t>(nullptr,
-                                           buf_coarse_size,
-                                           nullptr,
-                                           (uint32_t)max_batch,
-                                           (uint32_t)nlist_,
-                                           (uint32_t)nprobe,
-                                           nullptr,
-                                           nullptr,
-                                           greater_,
-                                           0);
-
-  size_t buf_refine_size = 0;
-//#ifdef RADIX
-#if 1
-  topk::radix_topk_11bits<float, size_t>(nullptr,
-                                         buf_refine_size,
-                                         nullptr,
-                                         nullptr,
-                                         (size_t)max_batch,
-                                         (size_t)max_k * nprobe,
-                                         (size_t)max_k,
-                                         nullptr,
-                                         nullptr,
-                                         greater_,
-                                         0);
-#else
-  nv::warp_sort_topk<float, size_t>(nullptr,
-                                    buf_refine_size,
-                                    nullptr,
-                                    nullptr,
-                                    (size_t)max_batch,
-                                    (size_t)(max_k * nprobe),
-                                    (size_t)max_k,
-                                    nullptr,
-                                    nullptr,
-                                    greater_,
-                                    0);
-#endif
-
-  buf_topk_size_            = buf_coarse_size > buf_refine_size ? buf_coarse_size : buf_refine_size;
-  uint32_t query_norm_size  = max_batch * sizeof(float);
-  std::vector<size_t> sizes = {query_norm_size,
-                               max_batch * nlist_ * sizeof(float),
-                               max_batch * nprobe * sizeof(float),
-                               max_batch * nprobe * sizeof(uint32_t),
-                               max_batch * nprobe * max_k * sizeof(float),
-                               max_batch * nprobe * max_k * sizeof(size_t),
-                               buf_topk_size_,
-                               float_query_size_};
-
-  select_workspace_dev_.resize(utils::calc_aligned_size(sizes), stream_);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
@@ -683,47 +630,40 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                                                  size_t* neighbors,  // [numQueries, topK]
                                                  value_t* distances)
 {
-  uint32_t nprobe = std::min(nprobe_, (uint32_t)nlist_);
-
-  grid_dim_x_ = 0;
+  uint32_t nprobe = std::min(nprobe_, nlist_);
+  grid_dim_x_     = 0;
   queryIVFFlatGridSize(nprobe, batch_size, k);
-  // Prepare the buffer for topk calculation
-  uint32_t query_norm_size  = batch_size * sizeof(float);
-  std::vector<size_t> sizes = {query_norm_size,
-                               batch_size * nlist_ * sizeof(float),
-                               batch_size * nprobe * sizeof(float),
-                               batch_size * nprobe * sizeof(uint32_t),
-                               batch_size * nprobe * k * sizeof(float),
-                               batch_size * nprobe * k * sizeof(size_t),
-                               buf_topk_size_,
-                               float_query_size_};
-  std::vector<void*> aligned_pointers =
-    utils::calc_aligned_pointers(select_workspace_dev_.data(), sizes);
-
-  // The norm of query [batch_size];
-  float* query_norm_dev_ptr = static_cast<float*>(aligned_pointers[0]);
-  // The distance value of cluster(list) and queries;[batch, nlist_]
-  float* distance_buffer_dev_ptr = static_cast<float*>(aligned_pointers[1]);
-  // The topk distance value of cluster(list) and queries;[batch, nprobe]
-  float* coarse_distances_dev_ptr = static_cast<float*>(aligned_pointers[2]);
-  // TODO:use float datatype here for now.
-  // The topk  index of cluster(list) and queries;[batch, nprobe]
-  uint32_t* coarse_indices_dev_ptr = static_cast<uint32_t*>(aligned_pointers[3]);
-  // The topk distance value of candicate vectors from each cluster(list);[batch,k]
-  value_t* refined_distances_dev_ptr = static_cast<value_t*>(aligned_pointers[4]);
-  // The topk index of candicate vectors from each cluster(list);[batch, k]
-  size_t* refined_indices_dev_ptr = static_cast<size_t*>(aligned_pointers[5]);
-  void* buf_topk_dev_ptr          = static_cast<void*>(aligned_pointers[6]);
-  float* convertedQueries         = static_cast<float*>(aligned_pointers[7]);
+  auto search_mr = &(search_mem_res.value());
+  // The norm of query
+  rmm::device_uvector<float> query_norm_dev(batch_size, stream_, search_mr);
+  // The distance value of cluster(list) and queries
+  rmm::device_uvector<float> distance_buffer_dev(batch_size * nlist_, stream_, search_mr);
+  // The topk distance value of cluster(list) and queries
+  rmm::device_uvector<float> coarse_distances_dev(batch_size * nprobe, stream_, search_mr);
+  // The topk  index of cluster(list) and queries
+  rmm::device_uvector<uint32_t> coarse_indices_dev(batch_size * nprobe, stream_, search_mr);
+  // The topk distance value of candicate vectors from each cluster(list)
+  rmm::device_uvector<value_t> refined_distances_dev(batch_size * nprobe * k, stream_, search_mr);
+  // The topk index of candicate vectors from each cluster(list)
+  rmm::device_uvector<size_t> refined_indices_dev(batch_size * nprobe * k, stream_, search_mr);
+
+  size_t float_query_size;
+  if constexpr (std::is_integral_v<T>) {
+    float_query_size = batch_size * dim_;
+  } else {
+    float_query_size = 0;
+  }
+  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream_, search_mr);
+  float* converted_queries_ptr = converted_queries_dev.data();
 
   if constexpr (std::is_same_v<T, float>) {
-    convertedQueries = const_cast<float*>(queries);
+    converted_queries_ptr = const_cast<float*>(queries);
   } else {
     utils::_cuann_copy<T, float>(batch_size,
                                  dim_,
                                  queries,
                                  dim_,
-                                 convertedQueries,
+                                 converted_queries_ptr,
                                  dim_,
                                  stream_,
                                  ivfflat_config<T>::kDivisor);
@@ -735,12 +675,15 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
-    utils::_cuann_sqsum(batch_size, dim_, convertedQueries, query_norm_dev_ptr);
-    utils::_cuann_outer_add(
-      query_norm_dev_ptr, batch_size, centriod_norm_dev_.data(), nlist_, distance_buffer_dev_ptr);
+    utils::_cuann_sqsum(batch_size, dim_, converted_queries_ptr, query_norm_dev.data());
+    utils::_cuann_outer_add(query_norm_dev.data(),
+                            batch_size,
+                            centriod_norm_dev_.data(),
+                            nlist_,
+                            distance_buffer_dev.data());
 #ifdef DEBUG_L2
     utils::printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
-    utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
+    utils::printDevPtr(distance_buffer_dev.data(), 20, "distance_buffer_dev_ptr");
 #endif
   } else {
     alpha = 1.0f;
@@ -756,33 +699,33 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                &alpha,
                centriod_dev_.data(),
                dim_,
-               convertedQueries,
+               converted_queries_ptr,
                dim_,
                &beta,
-               distance_buffer_dev_ptr,
+               distance_buffer_dev.data(),
                nlist_,
                stream_);
 
 #ifdef DEBUG_L2
-  utils::printDevPtr(distance_buffer_dev_ptr, 20, "distance_buffer_dev_ptr");
+  utils::printDevPtr(distance_buffer_dev.data(), 20, "distance_buffer_dev_ptr");
 #endif
-  topk::radix_topk_11bits<value_t, uint32_t>(buf_topk_dev_ptr,
-                                             buf_topk_size_,
-                                             distance_buffer_dev_ptr,
-                                             (uint32_t)batch_size,
-                                             (uint32_t)nlist_,
-                                             (uint32_t)nprobe,
-                                             coarse_distances_dev_ptr,
-                                             coarse_indices_dev_ptr,
-                                             greater_,
-                                             stream_);
+  topk::radix_topk<value_t, uint32_t, 11, 512>(distance_buffer_dev.data(),
+                                               nullptr,
+                                               batch_size,
+                                               nlist_,
+                                               nprobe,
+                                               coarse_distances_dev.data(),
+                                               coarse_indices_dev.data(),
+                                               !greater_,
+                                               stream_,
+                                               &(search_mem_res.value()));
 #ifdef DEBUG_L2
-  utils::printDevPtr(coarse_indices_dev_ptr, 1 * nprobe, "coarse_indices_dev_ptr");
-  utils::printDevPtr(coarse_distances_dev_ptr, 1 * nprobe, "coarse_distances_dev_ptr");
+  utils::printDevPtr(coarse_indices_dev.data(), 1 * nprobe, "coarse_indices_dev_ptr");
+  utils::printDevPtr(coarse_distances_dev.data(), 1 * nprobe, "coarse_distances_dev_ptr");
 #endif
 
-  value_t* distances_dev_ptr = refined_distances_dev_ptr;
-  size_t* indices_dev_ptr    = refined_indices_dev_ptr;
+  value_t* distances_dev_ptr = refined_distances_dev.data();
+  size_t* indices_dev_ptr    = refined_indices_dev.data();
   if (nprobe == 1 || grid_dim_x_ == 1) {
     distances_dev_ptr = distances;
     indices_dev_ptr   = neighbors;
@@ -790,7 +733,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
 
   ivfflat_interleaved_scan<T, typename ivfflat_config<T>::value_t>(
     queries,
-    coarse_indices_dev_ptr,
+    coarse_indices_dev.data(),
     list_index_dev_.data(),
     list_data_dev_.data(),
     list_lengths_dev_.data(),
@@ -815,22 +758,21 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if (grid_dim_x_ > 1) {
 //#ifdef RADIX
 #if 1
-    topk::radix_topk_11bits<value_t, size_t>(buf_topk_dev_ptr,
-                                             buf_topk_size_,
-                                             refined_distances_dev_ptr,
-                                             refined_indices_dev_ptr,
-                                             (size_t)batch_size,
-                                             (size_t)k * grid_dim_x_,
-                                             (size_t)k,
-                                             distances,
-                                             neighbors,
-                                             greater_,
-                                             stream_);
+    topk::radix_topk<value_t, size_t, 11, 512>(refined_distances_dev.data(),
+                                               refined_indices_dev.data(),
+                                               batch_size,
+                                               k * grid_dim_x_,
+                                               k,
+                                               distances,
+                                               neighbors,
+                                               !greater_,
+                                               stream_,
+                                               &(search_mem_res.value()));
 #else
     topk::warp_sort_topk<value_t, size_t>(buf_topk_dev_ptr,
                                           buf_topk_size_,
-                                          refined_distances_dev_ptr,
-                                          refined_indices_dev_ptr,
+                                          refined_distances_dev.data(),
+                                          refined_indices_dev.data(),
                                           (size_t)batch_size,
                                           (size_t)(k * grid_dim_x_),
                                           (size_t)k,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 2a8263b3d4..a73e08ff31 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -50,34 +50,6 @@ void printDevPtr(const T* d_cache, int len, const char* name)
   free(res);
 }
 
-inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
-{
-  const size_t ALIGN_BYTES = 256;
-  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
-  size_t total             = 0;
-  for (auto sz : sizes) {
-    total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-  }
-  return total + ALIGN_BYTES - 1;
-}
-
-inline std::vector<void*> calc_aligned_pointers(const void* p, const std::vector<size_t>& sizes)
-{
-  const size_t ALIGN_BYTES = 256;
-  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
-
-  char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);
-
-  std::vector<void*> aligned_pointers;
-  aligned_pointers.reserve(sizes.size());
-  for (auto sz : sizes) {
-    aligned_pointers.push_back(ptr);
-    ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-  }
-
-  return aligned_pointers;
-}
-
 //
 size_t _cuann_aligned(size_t size, size_t unit = 128)
 {
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index 2f06eb6558..c0b86c9970 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -645,171 +645,4 @@ void radix_topk(const T* in,
   }
 }
 
-inline size_t calc_aligned_size(const std::vector<size_t>& sizes)
-{
-  const size_t ALIGN_BYTES = 256;
-  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
-  size_t total             = 0;
-  for (auto sz : sizes) {
-    total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-  }
-  return total + ALIGN_BYTES - 1;
-}
-
-inline std::vector<void*> calc_aligned_pointers(const void* p, const std::vector<size_t>& sizes)
-{
-  const size_t ALIGN_BYTES = 256;
-  const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
-
-  char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);
-
-  std::vector<void*> aligned_pointers;
-  aligned_pointers.reserve(sizes.size());
-  for (auto sz : sizes) {
-    aligned_pointers.push_back(ptr);
-    ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-  }
-
-  return aligned_pointers;
-}
-
-template <typename T, typename idxT, int BITS_PER_PASS, int NUM_THREAD>
-void radix_topk(void* buf,
-                size_t& buf_size,
-                const T* in,
-                const idxT* in_idx,
-                idxT batch_size,
-                idxT len,
-                idxT k,
-                T* out,
-                idxT* out_idx,
-                bool greater,
-                cudaStream_t stream)
-{
-  // TODO: is it possible to relax this condition?
-  static_assert(calc_num_passes<T, BITS_PER_PASS>() > 1);
-  constexpr int num_buckets = calc_num_buckets<BITS_PER_PASS>();
-
-  Counter<T, idxT>* counters = nullptr;
-  idxT* histograms           = nullptr;
-  T* buf1                    = nullptr;
-  idxT* idx_buf1             = nullptr;
-  T* buf2                    = nullptr;
-  idxT* idx_buf2             = nullptr;
-  {
-    std::vector<size_t> sizes = {sizeof(*counters) * batch_size,
-                                 sizeof(*histograms) * num_buckets * batch_size,
-                                 sizeof(*buf1) * len * batch_size,
-                                 sizeof(*idx_buf1) * len * batch_size,
-                                 sizeof(*buf2) * len * batch_size,
-                                 sizeof(*idx_buf2) * len * batch_size};
-    size_t total_size         = calc_aligned_size(sizes);
-    if (!buf) {
-      buf_size = total_size;
-      return;
-    }
-
-    std::vector<void*> aligned_pointers = calc_aligned_pointers(buf, sizes);
-    counters                            = static_cast<decltype(counters)>(aligned_pointers[0]);
-    histograms                          = static_cast<decltype(histograms)>(aligned_pointers[1]);
-    buf1                                = static_cast<decltype(buf1)>(aligned_pointers[2]);
-    idx_buf1                            = static_cast<decltype(idx_buf1)>(aligned_pointers[3]);
-    buf2                                = static_cast<decltype(buf2)>(aligned_pointers[4]);
-    idx_buf2                            = static_cast<decltype(idx_buf2)>(aligned_pointers[5]);
-
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      buf,
-      0,
-      static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]),
-      stream));
-  }
-
-  const T* in_buf        = nullptr;
-  const idxT* in_idx_buf = nullptr;
-  T* out_buf             = nullptr;
-  idxT* out_idx_buf      = nullptr;
-
-  dim3 blocks((len - 1) / (NUM_THREAD * ITEM_PER_THREAD) + 1, batch_size);
-
-  constexpr int num_passes = calc_num_passes<T, BITS_PER_PASS>();
-  for (int pass = 0; pass < num_passes; pass++) {
-    if (pass == 0) {
-      in_buf      = in;
-      in_idx_buf  = nullptr;
-      out_buf     = nullptr;
-      out_idx_buf = nullptr;
-    } else if (pass == 1) {
-      in_buf      = in;
-      in_idx_buf  = in_idx ? in_idx : nullptr;
-      out_buf     = buf1;
-      out_idx_buf = idx_buf1;
-    } else if (pass % 2 == 0) {
-      in_buf      = buf1;
-      in_idx_buf  = idx_buf1;
-      out_buf     = buf2;
-      out_idx_buf = idx_buf2;
-    } else {
-      in_buf      = buf2;
-      in_idx_buf  = idx_buf2;
-      out_buf     = buf1;
-      out_idx_buf = idx_buf1;
-    }
-
-    radix_kernel<T, idxT, BITS_PER_PASS, NUM_THREAD><<<blocks, NUM_THREAD, 0, stream>>>(in_buf,
-                                                                                        in_idx_buf,
-                                                                                        out_buf,
-                                                                                        out_idx_buf,
-                                                                                        out,
-                                                                                        out_idx,
-                                                                                        counters,
-                                                                                        histograms,
-                                                                                        len,
-                                                                                        k,
-                                                                                        greater,
-                                                                                        pass);
-  }
-}
-
-template <typename T, typename idxT>
-void radix_topk_11bits(void* buf,
-                       size_t& buf_size,
-                       const T* in,
-                       idxT batch_size,
-                       idxT len,
-                       idxT k,
-                       T* out,
-                       idxT* out_idx       = nullptr,
-                       bool greater        = true,
-                       cudaStream_t stream = 0)
-{
-  radix_topk<T, idxT, 11, 512>(buf,
-                               buf_size,
-                               in,
-                               static_cast<idxT*>(nullptr),
-                               batch_size,
-                               len,
-                               k,
-                               out,
-                               out_idx,
-                               greater,
-                               stream);
-}
-
-template <typename T, typename idxT>
-void radix_topk_11bits(void* buf,
-                       size_t& buf_size,
-                       const T* in,
-                       const idxT* in_idx,
-                       idxT batch_size,
-                       idxT len,
-                       idxT k,
-                       T* out,
-                       idxT* out_idx       = nullptr,
-                       bool greater        = true,
-                       cudaStream_t stream = 0)
-{
-  radix_topk<T, idxT, 11, 512>(
-    buf, buf_size, in, in_idx, batch_size, len, k, out, out_idx, greater, stream);
-}
-
 }  // namespace raft::spatial::knn::detail::topk

From ca1aaada0b61a8030cea89ef0f9acb2c0977531c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 24 May 2022 08:51:06 +0200
Subject: [PATCH 031/118] Use raft logging

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 27 +++++++------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 4c35cb268f..ef40708858 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -21,6 +21,7 @@
 #include "ann_utils.cuh"
 #include "topk/radix_topk.cuh"
 
+#include <raft/common/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
@@ -507,9 +508,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data());
-#ifdef DEBUG_L2
-    utils::printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
-#endif
+    RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
 
   // Step 4: Record the number of elements in each clusters
@@ -681,10 +680,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                             centriod_norm_dev_.data(),
                             nlist_,
                             distance_buffer_dev.data());
-#ifdef DEBUG_L2
-    utils::printDevPtr(centriod_norm_dev_.data(), 20, "centriod_norm_dev_");
-    utils::printDevPtr(distance_buffer_dev.data(), 20, "distance_buffer_dev_ptr");
-#endif
+    RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
+    RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
     alpha = 1.0f;
     beta  = 0.0f;
@@ -706,9 +703,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                nlist_,
                stream_);
 
-#ifdef DEBUG_L2
-  utils::printDevPtr(distance_buffer_dev.data(), 20, "distance_buffer_dev_ptr");
-#endif
+  RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   topk::radix_topk<value_t, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                                nullptr,
                                                batch_size,
@@ -719,10 +714,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                                                !greater_,
                                                stream_,
                                                &(search_mem_res.value()));
-#ifdef DEBUG_L2
-  utils::printDevPtr(coarse_indices_dev.data(), 1 * nprobe, "coarse_indices_dev_ptr");
-  utils::printDevPtr(coarse_distances_dev.data(), 1 * nprobe, "coarse_distances_dev_ptr");
-#endif
+  RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * nprobe);
+  RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * nprobe);
 
   value_t* distances_dev_ptr = refined_distances_dev.data();
   size_t* indices_dev_ptr    = refined_indices_dev.data();
@@ -750,10 +743,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
     veclen_,
     grid_dim_x_);
 
-#ifdef DEBUG_L2
-  utils::printDevPtr(distances_dev_ptr, 2 * k, "distances_dev_ptr");
-  utils::printDevPtr(indices_dev_ptr, 2 * k, "indices_dev_ptr");
-#endif
+  RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
+  RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
 
   if (grid_dim_x_ > 1) {
 //#ifdef RADIX

From 70d84ecc785eca9f362c3d7d3bab07555cf3d908 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 24 May 2022 09:55:30 +0200
Subject: [PATCH 032/118] Updated logging and nvtx markers

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 42 ++++++++++---------
 .../knn/detail/ann_kmeans_balanced.cuh        | 11 -----
 .../raft/spatial/knn/detail/ann_utils.cuh     | 18 --------
 3 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index ef40708858..a251e6108b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -22,6 +22,7 @@
 #include "topk/radix_topk.cuh"
 
 #include <raft/common/logger.hpp>
+#include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
@@ -35,6 +36,13 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
+// TODO: remove this when #673 is merged
+#ifdef RAFT_LOG_TRACE_VEC
+#pragma message("PR #673 seems to be merged, please remove this CPP block.")
+#else
+#define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
+#endif
+
 namespace raft::spatial::knn::detail {
 
 template <typename T>
@@ -227,6 +235,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                                                            uint32_t nrow,
                                                            uint32_t ntrain)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "cuivflBuildOptimizedKmeans(%u, %u)", nrow, ntrain);
   uint32_t numTrainset   = ntrain;
   uint32_t numClusters   = nlist_;
   uint32_t dimDataset    = dim_;
@@ -237,7 +247,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   float* clusterCenters = centriod_managed_ptr;
 
   uint32_t numMesoClusters = pow((double)(numClusters), (double)1.0 / 2.0) + 0.5;
-  fprintf(stderr, "# numMesoClusters: %u\n", numMesoClusters);
+  RAFT_LOG_DEBUG("(%s) # numMesoClusters: %u", __func__, numMesoClusters);
 
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_uvector<float> mesoClusterCenters(
@@ -257,11 +267,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream_);
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
-    fprintf(stderr,
-            "(%s) Training kmeans of meso-clusters: %.1f / %u    \r",
-            __func__,
-            (float)iter / 2,
-            numIterations);
+    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, numIterations);
     _cuann_kmeans_predict(handle_,
                           mesoClusterCenters.data(),
                           numMesoClusters,
@@ -290,7 +296,6 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
     }    // end if iter < 2 * (numIterations - 2)
   }      // end for (int iter = 0; iter < 2 * numIterations; iter += 2)
 
-  fprintf(stderr, "\n");
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
   std::vector<uint32_t> numFineClusters(numMesoClusters);
@@ -317,8 +322,9 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
     numFineClustersMax      = max(numFineClustersMax, numFineClusters[i]);
     csumFineClusters[i + 1] = csumFineClusters[i] + numFineClusters[i];
   }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
-  // fprintf(stderr, "# mesoClusterSizeSum: %u\n", mesoClusterSizeSum);
-  // fprintf(stderr, "# numFineClustersSum: %u\n", numFineClustersSum);
+
+  RAFT_LOG_DEBUG("(%s) # mesoClusterSizeSum: %u", __func__, mesoClusterSizeSum);
+  RAFT_LOG_DEBUG("(%s) # numFineClustersSum: %u", __func__, numFineClustersSum);
   assert(mesoClusterSizeSum == numTrainset);
   assert(numFineClustersSum == numClusters);
   assert(csumFineClusters[numMesoClusters] == numClusters);
@@ -372,14 +378,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
     for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
-      fprintf(stderr,
-              "(%s) Training kmeans of clusters in meso-cluster %u (numClusters: %u): "
-              "%.1f / %u    \r",
-              __func__,
-              i,
-              numFineClusters[i],
-              (float)iter / 2,
-              numIterations);
+      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (numClusters: %u): %.1f / %u",
+                     i,
+                     numFineClusters[i],
+                     (float)iter / 2,
+                     numIterations);
 
       _cuann_kmeans_predict(handle_,
                             clusterCentersEach.data(),
@@ -415,7 +418,6 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                              cudaMemcpyDefault));
     numClustersDone += numFineClusters[i];
   }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
-  fprintf(stderr, "\n");
   assert(numClustersDone == numClusters);
 
   clusterCentersMP.resize(numClusters * dimDataset, stream_);
@@ -442,7 +444,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                           true);
   }  // end for (int iter = 0; iter < 2; iter++)
 
-  fprintf(stderr, "(%s) Final fitting\n", __func__);
+  RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
   sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, nrow_);
   predictWorkspace.resize(sizePredictWorkspace, stream_);
@@ -617,6 +619,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries,
                                              size_t* neighbors,  // [numQueries, topK]
                                              float* distances)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "cuivflSearch(%u, %u, %zu)", batch_size, k, neighbors);
   cuivflSearchImpl<float>(queries, batch_size, k, neighbors, distances);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflSearch
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 5cab5830a8..76fe7270fc 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -330,17 +330,6 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
     adjusted = true;
     count += 1;
   }
-
-#ifdef CUANN_DEBUG
-  if (count > 0) {
-    fprintf(stderr,
-            "(%s) num adjusted: %u / %u, threshold: %d\n",
-            __func__,
-            count,
-            numCenters,
-            (int)(average * threshold));
-  }
-#endif
   return adjusted;
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index a73e08ff31..18bd821b35 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -32,25 +32,7 @@ namespace utils {
 constexpr int kThreadPerBlock = 128;
 constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
 
-/*******************************************************/
-/*                   Debug Function                    */
-/*******************************************************/
 
-template <typename T>
-void printDevPtr(const T* d_cache, int len, const char* name)
-{
-  T* res = (T*)malloc(sizeof(T) * len);
-  RAFT_CUDA_TRY(cudaMemcpy(res, d_cache, sizeof(T) * len, cudaMemcpyDeviceToHost));
-  printf("%s ", name);
-  for (int i = 0; i < len; i++) {
-    printf("%d(%f) ", i, (float)res[i]);
-    if (i % 10 == 9) { printf("\n"); }
-  }
-  printf("\n");
-  free(res);
-}
-
-//
 size_t _cuann_aligned(size_t size, size_t unit = 128)
 {
   if (size % unit) { size += unit - (size % unit); }

From f9c12f876963b7463aec480086d4bb3680f2869c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 24 May 2022 09:58:03 +0200
Subject: [PATCH 033/118] clang-format

---
 cpp/include/raft/spatial/knn/detail/ann_utils.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 18bd821b35..740798ee08 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -32,7 +32,6 @@ namespace utils {
 constexpr int kThreadPerBlock = 128;
 constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
 
-
 size_t _cuann_aligned(size_t size, size_t unit = 128)
 {
   if (size % unit) { size += unit - (size % unit); }

From 957ac94607868ad21937ddd5ce22ee6cd9927991 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 24 May 2022 16:22:23 +0200
Subject: [PATCH 034/118] Use the recommended logger header

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index a251e6108b..48d8011168 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -21,8 +21,8 @@
 #include "ann_utils.cuh"
 #include "topk/radix_topk.cuh"
 
-#include <raft/common/logger.hpp>
 #include <raft/common/nvtx.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>

From ccfbccc3a18d7e3876bc2b7cd4cda2d787df1ff4 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 25 May 2022 09:51:13 +0200
Subject: [PATCH 035/118] Use warpsort for smaller k

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 87 ++++++++++---------
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 48d8011168..d2428c173b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -20,6 +20,7 @@
 #include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
 #include "topk/radix_topk.cuh"
+#include "topk/warpsort_topk.cuh"
 
 #include <raft/common/nvtx.hpp>
 #include <raft/core/logger.hpp>
@@ -36,13 +37,6 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-// TODO: remove this when #673 is merged
-#ifdef RAFT_LOG_TRACE_VEC
-#pragma message("PR #673 seems to be merged, please remove this CPP block.")
-#else
-#define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
-#endif
-
 namespace raft::spatial::knn::detail {
 
 template <typename T>
@@ -708,16 +702,28 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                stream_);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
-  topk::radix_topk<value_t, uint32_t, 11, 512>(distance_buffer_dev.data(),
-                                               nullptr,
-                                               batch_size,
-                                               nlist_,
-                                               nprobe,
-                                               coarse_distances_dev.data(),
-                                               coarse_indices_dev.data(),
-                                               !greater_,
-                                               stream_,
-                                               &(search_mem_res.value()));
+  if (nprobe <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+    topk::warp_sort_topk<value_t, uint32_t>(distance_buffer_dev.data(),
+                                            nullptr,
+                                            batch_size,
+                                            nlist_,
+                                            nprobe,
+                                            coarse_distances_dev.data(),
+                                            coarse_indices_dev.data(),
+                                            !greater_,
+                                            stream_);
+  } else {
+    topk::radix_topk<value_t, uint32_t, 11, 512>(distance_buffer_dev.data(),
+                                                 nullptr,
+                                                 batch_size,
+                                                 nlist_,
+                                                 nprobe,
+                                                 coarse_distances_dev.data(),
+                                                 coarse_indices_dev.data(),
+                                                 !greater_,
+                                                 stream_,
+                                                 &(search_mem_res.value()));
+  }
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * nprobe);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * nprobe);
 
@@ -751,31 +757,28 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
 
   if (grid_dim_x_ > 1) {
-//#ifdef RADIX
-#if 1
-    topk::radix_topk<value_t, size_t, 11, 512>(refined_distances_dev.data(),
-                                               refined_indices_dev.data(),
-                                               batch_size,
-                                               k * grid_dim_x_,
-                                               k,
-                                               distances,
-                                               neighbors,
-                                               !greater_,
-                                               stream_,
-                                               &(search_mem_res.value()));
-#else
-    topk::warp_sort_topk<value_t, size_t>(buf_topk_dev_ptr,
-                                          buf_topk_size_,
-                                          refined_distances_dev.data(),
-                                          refined_indices_dev.data(),
-                                          (size_t)batch_size,
-                                          (size_t)(k * grid_dim_x_),
-                                          (size_t)k,
-                                          distances,
-                                          neighbors,
-                                          greater_,
-                                          stream_);
-#endif
+    if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+      topk::warp_sort_topk<value_t, size_t>(refined_distances_dev.data(),
+                                            refined_indices_dev.data(),
+                                            batch_size,
+                                            k * grid_dim_x_,
+                                            k,
+                                            distances,
+                                            neighbors,
+                                            !greater_,
+                                            stream_);
+    } else {
+      topk::radix_topk<value_t, size_t, 11, 512>(refined_distances_dev.data(),
+                                                 refined_indices_dev.data(),
+                                                 batch_size,
+                                                 k * grid_dim_x_,
+                                                 k,
+                                                 distances,
+                                                 neighbors,
+                                                 !greater_,
+                                                 stream_,
+                                                 &(search_mem_res.value()));
+    }
   }  // end if nprobe=1
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;

From 7819397a2fce216234d5a2933f1a988b3f8aaa31 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 25 May 2022 10:11:30 +0200
Subject: [PATCH 036/118] Using raft helpers

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 62 +++++++------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index cedda06b24..08806b78cc 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -28,6 +28,7 @@
 #include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/pow2_utils.cuh>
 
 //#include <label/classlabels.cuh>
 #include <raft/distance/distance.hpp>
@@ -56,9 +57,6 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-#define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH)
-// build
-
 // #define DEBUG
 /// Init centriods
 template <typename T>
@@ -73,23 +71,6 @@ void ivfflat_centriod_init(T* dataset, T* centriod, int nlist, int dim, int n)
 }  // end func ivfflat_centriod_init
 
 // search
-template <typename U, typename V>
-constexpr __host__ __device__ auto divUp(U a, V b)
-{
-  return (a + b - 1) / b;
-}
-
-template <typename U, typename V>
-constexpr __host__ __device__ auto divDown(U a, V b)
-{
-  return (a / b);
-}
-
-template <typename U, typename V>
-constexpr __host__ __device__ auto roundDown(U a, V b)
-{
-  return divDown(a, b) * b;
-}
 
 template <typename T, int veclen>
 __device__ __forceinline__ void queryLoadToShmem(const T* const& query,
@@ -208,7 +189,7 @@ struct loadAndComputeDist {
         const int d = (i * kUnroll + j) * veclen;
 #pragma unroll
         for (int k = 0; k < veclen; ++k) {
-          q[k] = SHFL_SYNC(queryReg, d + k, WarpSize);
+          q[k] = shfl(queryReg, d + k, WarpSize);
           computeDist(dist, q[k], encV[j][k]);  //@TODO add other metrics
         }
       }
@@ -227,7 +208,7 @@ struct loadAndComputeDist {
       ldg(enc, data + loadDataIdx);
 #pragma unroll
       for (int k = 0; k < veclen; k++) {
-        q[k] = SHFL_SYNC(queryReg, d + k, WarpSize);
+        q[k] = shfl(queryReg, d + k, WarpSize);
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -296,7 +277,7 @@ struct loadAndComputeDist<kUnroll,
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = SHFL_SYNC(queryReg, d + k, WarpSize);
+          q[j][k] = shfl(queryReg, d + k, WarpSize);
           computeDist(dist, q[j][k], encV[j][k]);
         }
       }
@@ -319,7 +300,7 @@ struct loadAndComputeDist<kUnroll,
       ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, WarpSize);
+        q[k] = shfl(queryReg, (d / 4) + k, WarpSize);
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -373,7 +354,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uin
       for (int j = 0; j < kUnroll; ++j) {
         encV[j]     = reinterpret_cast<unsigned const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
+        q[j]        = shfl(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -390,7 +371,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uin
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
+      uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -444,7 +425,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uin
         encV[j]     = 0;
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
+        q[j]        = shfl(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -462,7 +443,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uin
     queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
+      uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -514,7 +495,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uin
         encV[j]     = 0;
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
+        q[j]        = shfl(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -533,7 +514,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uin
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = 0;
       enc          = data[laneId];
-      uint32_t q   = SHFL_SYNC(queryReg, d, WarpSize);
+      uint32_t q   = shfl(queryReg, d, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -601,7 +582,7 @@ struct loadAndComputeDist<kUnroll,
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = SHFL_SYNC(queryReg, d + k, WarpSize);
+          q[j][k] = shfl(queryReg, d + k, WarpSize);
           computeDist(dist, q[j][k], encV[j][k]);
         }
       }
@@ -621,7 +602,7 @@ struct loadAndComputeDist<kUnroll,
       ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = SHFL_SYNC(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
+        q[k] = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
         computeDist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
@@ -674,7 +655,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int
         encV[j]     = 0;
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
+        q[j]        = shfl(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -689,7 +670,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int
     queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
-      int32_t q   = SHFL_SYNC(queryReg, d / veclen, WarpSize);
+      int32_t q   = shfl(queryReg, d / veclen, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -743,7 +724,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
         encV[j]     = 0;
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
-        q[j]        = SHFL_SYNC(queryReg, d, WarpSize);
+        q[j]        = shfl(queryReg, d, WarpSize);
         computeDist(dist, q[j], encV[j]);
       }
     }
@@ -758,7 +739,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       int32_t enc = 0;
       enc         = data[laneId];
-      int32_t q   = SHFL_SYNC(queryReg, d, WarpSize);
+      int32_t q   = shfl(queryReg, d, WarpSize);
       computeDist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
@@ -806,8 +787,9 @@ __global__ void interleaved_scan(
   topk::block_sort<topk::warp_sort_filtered, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
 #endif
 
-  const int laneId = threadIdx.x % WarpSize;
-  const int warpId = threadIdx.x / WarpSize;
+  using align_warp = Pow2<WarpSize>;
+  const int laneId = align_warp::mod(threadIdx.x);
+  const int warpId = align_warp::div(threadIdx.x);
   int queryId      = blockIdx.y;
 
   /// Set the address
@@ -816,7 +798,7 @@ __global__ void interleaved_scan(
   constexpr int wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(T);
 
   // int wordsPerVectorBlock = wordsPerVectorBlockDim * dim;
-  const int dimBlocks = roundDown(dim, WarpSize);
+  const int dimBlocks = align_warp::roundDown(dim);
 
   // This should be multiple of warpSize = 32
   constexpr uint32_t queryShmemSize = 2048;
@@ -846,7 +828,7 @@ __global__ void interleaved_scan(
     const uint32_t numVecs = list_lengths[listId];
 
     // The number of interleaved group to be processed
-    const uint32_t numBlocks = divUp(numVecs, WarpSize);
+    const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
 
     for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
       value_t dist = 0;

From 510c46752adb0e5088f7d75ffa43d25668b90886 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 25 May 2022 13:08:53 +0200
Subject: [PATCH 037/118] Determine the template parameters Capacity and Veclen
 recursively

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 306 ++++--------------
 1 file changed, 61 insertions(+), 245 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 08806b78cc..2032afa829 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -956,11 +956,17 @@ void launch_interleaved_scan_kernel(
   size_t* neighbors,  // [batch_size, nprobe]
   float* distances,   // [batch_size, nprobe]
   const bool greater,
-  const int smem_size,
   const uint32_t batch_size,
   cudaStream_t stream,
   uint32_t& gridDimX)
 {
+#ifdef USE_FAISS
+  int smem_size = 0;
+#else
+  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<acc_type, size_t>(
+    utils::kNumWarps, k);
+#endif
+
   // Accumulation inner product lambda
   auto inner_prod_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
     if constexpr ((std::is_same<T, int8_t>{}) || (std::is_same<T, uint8_t>{})) {
@@ -1109,193 +1115,46 @@ void launch_interleaved_scan_kernel(
   }
 }
 
-template <int capacity, typename T, typename acc_type>
-void select_interleaved_scan_kernel(
-  const T* queries,        // Input: Query Vector; [batch_size, dim]
-  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
-  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
-  void* list_data,         // Record the full value of vector for each cluster(list) interleaved;
-                           // [nrow, dim]
-  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
-  uint32_t* list_prefix_interleave,           // The start offset of each cluster(list) for
-                                              // list_index; [nlist]
-  const raft::distance::DistanceType metric,  // Function to process the different metric
-  const uint32_t nprobe,
-  const uint32_t k,
-  const uint32_t dim,
-  size_t* neighbors,  // [batch_size, nprobe]
-  float* distances,   // [batch_size, nprobe]
-  const bool greater,
-  const int smem_size,
-  const uint32_t batch_size,
-  cudaStream_t stream,
-  const int veclen,
-  uint32_t& gridDimX)
-{
-  if constexpr ((std::is_same<T, uint8_t>{}) || (std::is_same<T, int8_t>{})) {
-    switch (veclen) {
-      case 1:
-        launch_interleaved_scan_kernel<capacity, 1, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 2:
-        launch_interleaved_scan_kernel<capacity, 2, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 4:
-        launch_interleaved_scan_kernel<capacity, 4, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 8:
-        launch_interleaved_scan_kernel<capacity, 8, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 16:
-        launch_interleaved_scan_kernel<capacity, 16, T, acc_type>(queries,
-                                                                  coarse_index,
-                                                                  list_index,
-                                                                  list_data,
-                                                                  list_lengths,
-                                                                  list_prefix_interleave,
-                                                                  metric,
-                                                                  nprobe,
-                                                                  k,
-                                                                  dim,
-                                                                  neighbors,
-                                                                  distances,
-                                                                  greater,
-                                                                  smem_size,
-                                                                  batch_size,
-                                                                  stream,
-                                                                  gridDimX);
-        break;
-      default: assert("veclen should be 1, 2, 4, 8 or 16\n"); break;
+/**
+ * Lift the `capacity` and `veclen` parameters to the template level,
+ * forward the rest of the arguments unmodified to `launch_interleaved_scan_kernel`.
+ */
+template <typename T,
+          typename AccT,
+          int Capacity = topk::kMaxCapacity,
+          int Veclen   = std::max<int>(1, 16 / sizeof(T))>
+struct select_interleaved_scan_kernel {
+  /**
+   * Recursively reduce the `Capacity` and `Veclen` parameters until they match the
+   * corresponding runtime arguments.
+   * By default, this recursive process starts with maximum possible values of the
+   * two parameters and ends with both values equal to 1.
+   */
+  template <typename... Args>
+  static inline void run(int capacity, int veclen, Args&&... args)
+  {
+    if constexpr (Capacity > 1) {
+      if (capacity * 2 <= Capacity) {
+        return select_interleaved_scan_kernel<T, AccT, Capacity / 2, Veclen>::run(
+          capacity, veclen, args...);
+      }
     }
-  } else if constexpr (std::is_same<T, float>{}) {
-    switch (veclen) {
-      case 1:
-        launch_interleaved_scan_kernel<capacity, 1, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 2:
-        launch_interleaved_scan_kernel<capacity, 2, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      case 4:
-        launch_interleaved_scan_kernel<capacity, 4, T, acc_type>(queries,
-                                                                 coarse_index,
-                                                                 list_index,
-                                                                 list_data,
-                                                                 list_lengths,
-                                                                 list_prefix_interleave,
-                                                                 metric,
-                                                                 nprobe,
-                                                                 k,
-                                                                 dim,
-                                                                 neighbors,
-                                                                 distances,
-                                                                 greater,
-                                                                 smem_size,
-                                                                 batch_size,
-                                                                 stream,
-                                                                 gridDimX);
-        break;
-      default: assert("veclen should be 1, 2 or 4\n"); break;
+    if constexpr (Veclen > 1) {
+      if (veclen * 2 <= Veclen) {
+        return select_interleaved_scan_kernel<T, AccT, Capacity, Veclen / 2>::run(
+          capacity, veclen, args...);
+      }
     }
+    RAFT_EXPECTS(capacity == Capacity,
+                 "Capacity must be power-of-two not bigger than the maximum allowed size.");
+    RAFT_EXPECTS(
+      veclen == Veclen,
+      "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
+    return launch_interleaved_scan_kernel<Capacity, Veclen, T, AccT>(args...);
   }
-}
+};
 
-template <typename T, typename value_t>
+template <typename T, typename AccT>
 void ivfflat_interleaved_scan(const T* queries,                  //[batch_size, dim]
                               uint32_t* coarse_index,            //[batch_size,nprobe]
                               uint32_t* list_index,              // [nrow]
@@ -1315,67 +1174,24 @@ void ivfflat_interleaved_scan(const T* queries,                  //[batch_size,
                               uint32_t& gridDimX)
 {
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
-
-#ifdef USE_FAISS
-  int smem_size = 0;
-#else
-  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<value_t, size_t>(
-    utils::kNumWarps, k);
-#endif
-
-  switch (capacity) {
-    case 32:
-      select_interleaved_scan_kernel<32, T, value_t>(queries,
-                                                     coarse_index,
-                                                     list_index,
-                                                     list_data,
-                                                     list_lengths,
-                                                     list_prefix_interleave,
-                                                     metric,
-                                                     nprobe,
-                                                     k,
-                                                     dim,
-                                                     neighbors,
-                                                     distances,
-                                                     greater,
-                                                     smem_size,
-                                                     batch_size,
-                                                     stream,
-                                                     veclen,
-                                                     gridDimX);
-      break;
-    // case 64:
-    //   select_interleaved_scan_kernel<64, T, value_t>(queries, coarse_index,
-    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
-    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
-    //   stream, veclen, gridDimX);
-    //   break;
-    // case 128:
-    //   select_interleaved_scan_kernel<128, T, value_t>(queries, coarse_index,
-    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
-    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
-    //   stream, veclen, gridDimX);
-    //   break;
-    // case 256:
-    //   select_interleaved_scan_kernel<256, T, value_t>(queries, coarse_index,
-    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
-    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
-    //   stream, veclen, gridDimX);
-    //   break;
-    // case 512:
-    //   select_interleaved_scan_kernel<512, T, value_t>(queries, coarse_index,
-    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
-    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
-    //   stream, veclen, gridDimX);
-    //   break;
-    // case 1024:
-    //   select_interleaved_scan_kernel<1024, T, value_t>(queries, coarse_index,
-    //   list_index, list_data, list_lengths, list_prefix_interleave, metric,
-    //   nprobe, k, dim, neighbors, distances, greater, smem_size, batch_size,
-    //   stream, veclen, gridDimX);
-    //   break;
-    default: break;
-  }  // end switch
+  select_interleaved_scan_kernel<T, AccT>::run(capacity,
+                                               veclen,
+                                               queries,
+                                               coarse_index,
+                                               list_index,
+                                               list_data,
+                                               list_lengths,
+                                               list_prefix_interleave,
+                                               metric,
+                                               nprobe,
+                                               k,
+                                               dim,
+                                               neighbors,
+                                               distances,
+                                               greater,
+                                               batch_size,
+                                               stream,
+                                               gridDimX);
 }
 
 }  // namespace detail

From c5087be261057a8564c0031361ef7ebaa3f43d25 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 26 May 2022 08:01:26 +0200
Subject: [PATCH 038/118] wip: refactoring and reducing duplicate calls

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |   2 +-
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 355 +++++++-----------
 2 files changed, 130 insertions(+), 227 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index d2428c173b..3af29208b8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -574,7 +574,7 @@ cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
                                                                    dim_,
                                                                    nullptr,
                                                                    nullptr,
-                                                                   0,
+                                                                   stream_,
                                                                    greater_,
                                                                    veclen_,
                                                                    grid_dim_x_);
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 2032afa829..e7fa652655 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -16,69 +16,34 @@
 
 #pragma once
 
+// #define USE_FAISS
+
 #include "../ann_common.h"
 #include "ann_utils.cuh"
-#include "knn_brute_force_faiss.cuh"
 #include "topk/warpsort_topk.cuh"
-#include <raft/common/device_loads_stores.cuh>
-
-#include "common_faiss.h"
-#include "processing.hpp"
 
-#include "processing.hpp"
+#include <raft/common/device_loads_stores.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/distance/distance.cuh>
 #include <raft/pow2_utils.cuh>
 
-//#include <label/classlabels.cuh>
-#include <raft/distance/distance.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
-
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
+#ifdef USE_FAISS
+#include <faiss/gpu/utils/Comparators.cuh>
 #include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <thrust/iterator/transform_iterator.h>
-
-#include <raft/distance/distance_type.hpp>
-
-#include <iostream>
-#include <set>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
+#endif
 
-// #define DEBUG
-/// Init centriods
-template <typename T>
-void ivfflat_centriod_init(T* dataset, T* centriod, int nlist, int dim, int n)
-{
-  // srand (time(NULL));
-  int nparts = n / nlist;
-  int index  = rand() % nparts;
-  for (int i = 0; i < nlist; i++) {
-    memcpy(centriod + i * dim, dataset + i * nparts + index, sizeof(T) * dim);
-  }  // end for
-}  // end func ivfflat_centriod_init
+#include <rmm/cuda_stream_view.hpp>
 
-// search
+namespace raft::spatial::knn::detail {
 
-template <typename T, int veclen>
+template <typename T, int Veclen>
 __device__ __forceinline__ void queryLoadToShmem(const T* const& query,
                                                  T* queryShared,
                                                  const int loadDim)
 {
-  T queryReg[veclen];
-  const int loadIndex = loadDim * veclen;
+  T queryReg[Veclen];
+  const int loadIndex = loadDim * Veclen;
   ldg(queryReg, query + loadIndex);
   sts(&queryShared[loadIndex], queryReg);
 }
@@ -134,7 +99,7 @@ __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const
 template <int kUnroll,
           int wordsPerVectorBlockDim,
           typename computeLambda,
-          int veclen,
+          int Veclen,
           typename T,
           typename AccT>
 struct loadAndComputeDist {
@@ -153,17 +118,17 @@ struct loadAndComputeDist {
                                                       IdxT baseShmemIndex,
                                                       IdxT iShmemIndex)
   {
-    T encV[kUnroll][veclen];
-    T queryRegs[kUnroll][veclen];
-    constexpr int stride  = kUnroll * veclen;
+    T encV[kUnroll][Veclen];
+    T queryRegs[kUnroll][Veclen];
+    constexpr int stride  = kUnroll * Veclen;
     const int shmemStride = baseShmemIndex + iShmemIndex * stride;
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      ldg(encV[j], data + (loadIndex + j * wordsPerVectorBlockDim) * veclen);
-      const int d = shmemStride + j * veclen;
+      ldg(encV[j], data + (loadIndex + j * wordsPerVectorBlockDim) * Veclen);
+      const int d = shmemStride + j * Veclen;
       lds(queryRegs[j], &queryShared[d]);
 #pragma unroll
-      for (int k = 0; k < veclen; ++k) {
+      for (int k = 0; k < Veclen; ++k) {
         computeDist(dist, queryRegs[j][k], encV[j][k]);
       }
     }
@@ -175,20 +140,20 @@ struct loadAndComputeDist {
                                                         IdxT baseLoadIndex,
                                                         const int laneId)
   {
-    T encV[kUnroll][veclen];
+    T encV[kUnroll][Veclen];
     T queryReg               = query[baseLoadIndex + laneId];
-    constexpr int stride     = kUnroll * veclen;
+    constexpr int stride     = kUnroll * Veclen;
     constexpr int totalIter  = WarpSize / stride;
     constexpr int gmemStride = stride * wordsPerVectorBlockDim;
 #pragma unroll
     for (int i = 0; i < totalIter; ++i, data += gmemStride) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        ldg(encV[j], (data + (laneId + j * wordsPerVectorBlockDim) * veclen));
-        T q[veclen];
-        const int d = (i * kUnroll + j) * veclen;
+        ldg(encV[j], (data + (laneId + j * wordsPerVectorBlockDim) * Veclen));
+        T q[Veclen];
+        const int d = (i * kUnroll + j) * Veclen;
 #pragma unroll
-        for (int k = 0; k < veclen; ++k) {
+        for (int k = 0; k < Veclen; ++k) {
           q[k] = shfl(queryReg, d + k, WarpSize);
           computeDist(dist, q[k], encV[j][k]);  //@TODO add other metrics
         }
@@ -201,13 +166,13 @@ struct loadAndComputeDist {
   {
     const int loadDim     = dimBlocks + laneId;
     T queryReg            = loadDim < dim ? query[loadDim] : 0;
-    const int loadDataIdx = laneId * veclen;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
-      T enc[veclen];
-      T q[veclen];
+    const int loadDataIdx = laneId * Veclen;
+    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
+      T enc[Veclen];
+      T q[Veclen];
       ldg(enc, data + loadDataIdx);
 #pragma unroll
-      for (int k = 0; k < veclen; k++) {
+      for (int k = 0; k < Veclen; k++) {
         q[k] = shfl(queryReg, d + k, WarpSize);
         computeDist(dist, q[k], enc[k]);
       }
@@ -215,7 +180,7 @@ struct loadAndComputeDist {
   }
 };
 
-// This handles uint8_t 8, 16 veclens
+// This handles uint8_t 8, 16 Veclens
 template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda, int uint8_veclen>
 struct loadAndComputeDist<kUnroll,
                           wordsPerVectorBlockDim,
@@ -307,7 +272,7 @@ struct loadAndComputeDist<kUnroll,
   }
 };
 
-// Keep this specialized uint8 veclen = 4, because compiler is generating suboptimal code while
+// Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
 // using above common template of int2/int4
 template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uint8_t, uint32_t> {
@@ -745,9 +710,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
   }
 };
 
-//#define USE_FAISS 1
-
-template <int CAPACITY, int veclen, typename T, typename value_t, typename distLambda, bool GREATER>
+template <int Capacity, int Veclen, typename T, typename value_t, typename distLambda, bool GREATER>
 __global__ void interleaved_scan(
   const T* queries,        // Input: Query Vector; [batch_size, dim]
   uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
@@ -784,7 +747,7 @@ __global__ void interleaved_scan(
 
 #else
   extern __shared__ __align__(256) uint8_t smem_ext[];
-  topk::block_sort<topk::warp_sort_filtered, CAPACITY, !GREATER, float, size_t> queue(k, smem_ext);
+  topk::block_sort<topk::warp_sort_filtered, Capacity, !GREATER, float, size_t> queue(k, smem_ext);
 #endif
 
   using align_warp = Pow2<WarpSize>;
@@ -805,13 +768,13 @@ __global__ void interleaved_scan(
   __shared__ T queryShared[queryShmemSize];
 
   int shLoadDim = (dim < queryShmemSize) ? dim : queryShmemSize;
-  shLoadDim     = shLoadDim / veclen;
+  shLoadDim     = shLoadDim / Veclen;
 
   for (int loadDim = threadIdx.x; loadDim < shLoadDim; loadDim += blockDim.x) {
-    queryLoadToShmem<T, veclen>(query, queryShared, loadDim);
+    queryLoadToShmem<T, Veclen>(query, queryShared, loadDim);
   }
   __syncthreads();
-  shLoadDim = (dim > queryShmemSize) ? (shLoadDim * veclen) : dimBlocks;
+  shLoadDim = (dim > queryShmemSize) ? (shLoadDim * Veclen) : dimBlocks;
 
   for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
     uint32_t listId = coarse_index[queryId * nprobe + probeId];  // The id of cluster(list)
@@ -843,14 +806,14 @@ __global__ void interleaved_scan(
       if (valid) {
         /// load query from shared mem
         for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {  //
-          constexpr int kUnroll   = WarpSize / veclen;
-          constexpr int stride    = kUnroll * veclen;
+          constexpr int kUnroll   = WarpSize / Veclen;
+          constexpr int stride    = kUnroll * Veclen;
           constexpr int totalIter = WarpSize / stride;
 
           loadAndComputeDist<kUnroll,
                              wordsPerVectorBlockDim,
                              decltype(computeDist),
-                             veclen,
+                             Veclen,
                              T,
                              value_t>
             obj(dist, computeDist);
@@ -862,12 +825,12 @@ __global__ void interleaved_scan(
       }
 
       if (dim > queryShmemSize) {
-        constexpr int kUnroll = WarpSize / veclen;
+        constexpr int kUnroll = WarpSize / Veclen;
         ;
         loadAndComputeDist<kUnroll,
                            wordsPerVectorBlockDim,
                            decltype(computeDist),
-                           veclen,
+                           Veclen,
                            T,
                            value_t>
           obj(dist, computeDist);
@@ -881,8 +844,8 @@ __global__ void interleaved_scan(
         if (valid) {
           /// Remainder chunk = dim - dimBlocks
           for (int d = 0; d < dim - dimBlocks;
-               d += veclen, data += wordsPerVectorBlockDim * veclen) {
-            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), veclen, T, value_t>
+               d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
+            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, value_t>
               obj(dist, computeDist);
             obj.runLoadShmemCompute(data, queryShared, laneId, dimBlocks + d, 0);
           }  // end for d < dim - dimBlocks
@@ -939,38 +902,60 @@ dim3 launchConfigGenerator(uint32_t numQueries, uint32_t nprobe, int32_t sMemSiz
   return grid;
 }
 
-template <int capacity, int veclen, typename T, typename acc_type>
-void launch_interleaved_scan_kernel(
-  const T* queries,        // Input: Query Vector; [batch_size, dim]
-  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
-  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
-  void* list_data,         // Record the full value of vector for each cluster(list) interleaved;
-                           // [nrow, dim]
-  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
-  uint32_t* list_prefix_interleave,     // The start offset of each cluster(list) for
-                                        // list_index; [nlist]
-  raft::distance::DistanceType metric,  // Function to process the different metric
-  const uint32_t nprobe,
-  const uint32_t k,
-  const uint32_t dim,
-  size_t* neighbors,  // [batch_size, nprobe]
-  float* distances,   // [batch_size, nprobe]
-  const bool greater,
-  const uint32_t batch_size,
-  cudaStream_t stream,
-  uint32_t& gridDimX)
+template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
+void launch_with_lambda(Lambda lambda,
+                        raft::distance::DistanceType metric,
+                        const T* queries,
+                        uint32_t* coarse_index,
+                        uint32_t* list_index,
+                        T* list_data,
+                        uint32_t* list_lengths,
+                        uint32_t* list_prefix_interleave,
+                        const uint32_t nprobe,
+                        const uint32_t k,
+                        const uint32_t dim,
+                        size_t* neighbors,
+                        float* distances,
+                        const uint32_t batch_size,
+                        uint32_t& gridDimX,
+                        rmm::cuda_stream_view stream)
 {
+  constexpr auto kKernel = interleaved_scan<Capacity, Veclen, T, AccT, Lambda, Greater>;
 #ifdef USE_FAISS
   int smem_size = 0;
 #else
-  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<acc_type, size_t>(
+  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
     utils::kNumWarps, k);
 #endif
 
+  dim3 grid_dim = launchConfigGenerator(batch_size, nprobe, smem_size, kKernel);
+  if (gridDimX == 0) {
+    gridDimX = grid_dim.x;
+    return;
+  }
+  dim3 block_dim(utils::kThreadPerBlock);
+  kKernel<<<grid_dim, block_dim, smem_size, stream>>>(queries,
+                                                      coarse_index,
+                                                      list_index,
+                                                      list_data,
+                                                      list_lengths,
+                                                      list_prefix_interleave,
+                                                      metric,
+                                                      lambda,
+                                                      nprobe,
+                                                      k,
+                                                      dim,
+                                                      neighbors,
+                                                      distances);
+}
+
+template <int Capacity, int Veclen, bool Greater, typename T, typename acc_type, typename... Args>
+void launch_interleaved_scan_kernel(raft::distance::DistanceType metric, Args&&... args)
+{
   // Accumulation inner product lambda
   auto inner_prod_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
     if constexpr ((std::is_same<T, int8_t>{}) || (std::is_same<T, uint8_t>{})) {
-      if constexpr (veclen == 1) {
+      if constexpr (Veclen == 1) {
         acc += x * y;
       } else {
         acc = dp4a(x, y, acc);
@@ -983,7 +968,7 @@ void launch_interleaved_scan_kernel(
   // Accumulation euclidean L2 lambda
   auto euclidean_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
     if constexpr ((std::is_same<T, uint8_t>{})) {
-      if constexpr (veclen == 1) {
+      if constexpr (Veclen == 1) {
         const acc_type diff = x - y;
         acc += diff * diff;
       } else {
@@ -991,7 +976,7 @@ void launch_interleaved_scan_kernel(
         acc                 = dp4a(diff, diff, acc);
       }
     } else if constexpr (std::is_same<T, int8_t>{}) {
-      if constexpr (veclen == 1) {
+      if constexpr (Veclen == 1) {
         const acc_type diff = x - y;
         acc += diff * diff;
       } else {
@@ -1004,114 +989,13 @@ void launch_interleaved_scan_kernel(
     }
   };
 
-  dim3 block_dim(utils::kThreadPerBlock);
-
-  if (greater) {
-    if (metric == raft::distance::DistanceType::L2Expanded ||
-        metric == raft::distance::DistanceType::L2Unexpanded) {
-      constexpr auto interleaved_scan_euclidean_greater =
-        interleaved_scan<capacity, veclen, T, acc_type, decltype(euclidean_lambda), true>;
-      if (gridDimX == 0) {
-        dim3 grid_dim =
-          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_greater);
-        gridDimX = grid_dim.x;
-        return;
-      }
-      dim3 grid_dim =
-        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_greater);
-      interleaved_scan_euclidean_greater<<<grid_dim, block_dim, smem_size, stream>>>(
-        queries,
-        coarse_index,
-        list_index,
-        (T*)list_data,
-        list_lengths,
-        list_prefix_interleave,
-        metric,
-        euclidean_lambda,
-        nprobe,
-        k,
-        dim,
-        neighbors,
-        distances);
-    } else {
-      constexpr auto interleaved_scan_inner_prod_greater =
-        interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), true>;
-      if (gridDimX == 0) {
-        dim3 grid_dim =
-          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_greater);
-        gridDimX = grid_dim.x;
-        return;
-      }
-      dim3 grid_dim =
-        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_greater);
-      interleaved_scan_inner_prod_greater<<<grid_dim, block_dim, smem_size, stream>>>(
-        queries,
-        coarse_index,
-        list_index,
-        (T*)list_data,
-        list_lengths,
-        list_prefix_interleave,
-        metric,
-        inner_prod_lambda,
-        nprobe,
-        k,
-        dim,
-        neighbors,
-        distances);
-    }
+  if (metric == raft::distance::DistanceType::L2Expanded ||
+      metric == raft::distance::DistanceType::L2Unexpanded) {
+    launch_with_lambda<Capacity, Veclen, Greater, T, acc_type, decltype(euclidean_lambda)>(
+      euclidean_lambda, metric, args...);
   } else {
-    if (metric == raft::distance::DistanceType::L2Expanded ||
-        metric == raft::distance::DistanceType::L2Unexpanded) {
-      constexpr auto interleaved_scan_euclidean_ngreater =
-        interleaved_scan<capacity, veclen, T, acc_type, decltype(euclidean_lambda), false>;
-      if (gridDimX == 0) {
-        dim3 grid_dim =
-          launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_ngreater);
-        gridDimX = grid_dim.x;
-        return;
-      }
-      dim3 grid_dim =
-        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_euclidean_ngreater);
-      interleaved_scan_euclidean_ngreater<<<grid_dim, block_dim, smem_size, stream>>>(
-        queries,
-        coarse_index,
-        list_index,
-        (T*)list_data,
-        list_lengths,
-        list_prefix_interleave,
-        metric,
-        euclidean_lambda,
-        nprobe,
-        k,
-        dim,
-        neighbors,
-        distances);
-    } else {
-      constexpr auto interleaved_scan_inner_prod_ngreater =
-        interleaved_scan<capacity, veclen, T, acc_type, decltype(inner_prod_lambda), false>;
-      if (gridDimX == 0) {
-        dim3 grid_dim = launchConfigGenerator(
-          batch_size, nprobe, smem_size, interleaved_scan_inner_prod_ngreater);
-        gridDimX = grid_dim.x;
-        return;
-      }
-      dim3 grid_dim =
-        launchConfigGenerator(batch_size, nprobe, smem_size, interleaved_scan_inner_prod_ngreater);
-      interleaved_scan_inner_prod_ngreater<<<grid_dim, block_dim, smem_size, stream>>>(
-        queries,
-        coarse_index,
-        list_index,
-        (T*)list_data,
-        list_lengths,
-        list_prefix_interleave,
-        metric,
-        inner_prod_lambda,
-        nprobe,
-        k,
-        dim,
-        neighbors,
-        distances);
-    }
+    launch_with_lambda<Capacity, Veclen, Greater, T, acc_type, decltype(inner_prod_lambda)>(
+      inner_prod_lambda, metric, args...);
   }
 }
 
@@ -1131,18 +1015,18 @@ struct select_interleaved_scan_kernel {
    * two parameters and ends with both values equal to 1.
    */
   template <typename... Args>
-  static inline void run(int capacity, int veclen, Args&&... args)
+  static inline void run(int capacity, int veclen, bool greater, Args&&... args)
   {
     if constexpr (Capacity > 1) {
       if (capacity * 2 <= Capacity) {
         return select_interleaved_scan_kernel<T, AccT, Capacity / 2, Veclen>::run(
-          capacity, veclen, args...);
+          capacity, veclen, greater, args...);
       }
     }
     if constexpr (Veclen > 1) {
       if (veclen * 2 <= Veclen) {
         return select_interleaved_scan_kernel<T, AccT, Capacity, Veclen / 2>::run(
-          capacity, veclen, args...);
+          capacity, veclen, greater, args...);
       }
     }
     RAFT_EXPECTS(capacity == Capacity,
@@ -1150,15 +1034,37 @@ struct select_interleaved_scan_kernel {
     RAFT_EXPECTS(
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
-    return launch_interleaved_scan_kernel<Capacity, Veclen, T, AccT>(args...);
+    if (greater) {
+      launch_interleaved_scan_kernel<Capacity, Veclen, true, T, AccT>(args...);
+    } else {
+      launch_interleaved_scan_kernel<Capacity, Veclen, false, T, AccT>(args...);
+    }
   }
 };
 
+// rmm::cuda_stream_view stream,
+// const T* queries,        // Input: Query Vector; [batch_size, dim]
+// uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
+// uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
+// T* list_data,            // Record the full value of vector for each cluster(list) interleaved;
+//                          // [nrow, dim]
+// uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
+// uint32_t* list_prefix_interleave,     // The start offset of each cluster(list) for
+//                                       // list_index; [nlist]
+// raft::distance::DistanceType metric,  // Function to process the different metric
+// const uint32_t nprobe,
+// const uint32_t k,
+// const uint32_t dim,
+// size_t* neighbors,  // [batch_size, nprobe]
+// float* distances,   // [batch_size, nprobe]
+// const uint32_t batch_size,
+// uint32_t& gridDimX
+
 template <typename T, typename AccT>
 void ivfflat_interleaved_scan(const T* queries,                  //[batch_size, dim]
                               uint32_t* coarse_index,            //[batch_size,nprobe]
                               uint32_t* list_index,              // [nrow]
-                              void* list_data,                   //[nrow, dim]
+                              T* list_data,                      //[nrow, dim]
                               uint32_t* list_lengths,            // [nlist]
                               uint32_t* list_prefix_interleave,  // [nlist]
                               const raft::distance::DistanceType metric,
@@ -1168,7 +1074,7 @@ void ivfflat_interleaved_scan(const T* queries,                  //[batch_size,
                               const uint32_t dim,
                               size_t* neighbors,  // [batch_size, nprobe, k]
                               float* distances,   // [batch_size, nprobe, k]
-                              cudaStream_t stream,
+                              rmm::cuda_stream_view stream,
                               const bool greater,
                               const int veclen,
                               uint32_t& gridDimX)
@@ -1176,25 +1082,22 @@ void ivfflat_interleaved_scan(const T* queries,                  //[batch_size,
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
   select_interleaved_scan_kernel<T, AccT>::run(capacity,
                                                veclen,
+                                               greater,
+                                               metric,
                                                queries,
                                                coarse_index,
                                                list_index,
                                                list_data,
                                                list_lengths,
                                                list_prefix_interleave,
-                                               metric,
                                                nprobe,
                                                k,
                                                dim,
                                                neighbors,
                                                distances,
-                                               greater,
                                                batch_size,
-                                               stream,
-                                               gridDimX);
+                                               gridDimX,
+                                               stream);
 }
 
-}  // namespace detail
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn::detail

From c5f1c89b84d9b001173d6c43a23d482255672943 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 27 May 2022 13:33:02 +0200
Subject: [PATCH 039/118] Refactor and document ann_ivf_flat_kernel

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 310 +++++++++---------
 1 file changed, 159 insertions(+), 151 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index e7fa652655..e4c7f8a25a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -98,16 +98,15 @@ __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const
 
 template <int kUnroll,
           int wordsPerVectorBlockDim,
-          typename computeLambda,
+          typename Lambda,
           int Veclen,
           typename T,
           typename AccT>
 struct loadAndComputeDist {
-  computeLambda computeDist;
+  Lambda computeDist;
   AccT& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(AccT& dist, computeLambda op)
-    : dist(dist), computeDist(op)
+  __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op) : dist(dist), computeDist(op)
   {
   }
 
@@ -181,17 +180,17 @@ struct loadAndComputeDist {
 };
 
 // This handles uint8_t 8, 16 Veclens
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda, int uint8_veclen>
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda, int uint8_veclen>
 struct loadAndComputeDist<kUnroll,
                           wordsPerVectorBlockDim,
-                          computeLambda,
+                          Lambda,
                           uint8_veclen,
                           uint8_t,
                           uint32_t> {
-  computeLambda computeDist;
+  Lambda computeDist;
   uint32_t& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -274,12 +273,12 @@ struct loadAndComputeDist<kUnroll,
 
 // Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
 // using above common template of int2/int4
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uint8_t, uint32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, uint32_t> {
+  Lambda computeDist;
   uint32_t& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -342,12 +341,12 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 4, uin
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uint8_t, uint32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, uint32_t> {
+  Lambda computeDist;
   uint32_t& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -414,12 +413,12 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, uin
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uint8_t, uint32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, uint32_t> {
+  Lambda computeDist;
   uint32_t& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -486,17 +485,12 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, uin
 };
 
 // This device function is for int8 veclens 4, 8 and 16
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda, int int8_veclen>
-struct loadAndComputeDist<kUnroll,
-                          wordsPerVectorBlockDim,
-                          computeLambda,
-                          int8_veclen,
-                          int8_t,
-                          int32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda, int int8_veclen>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen, int8_t, int32_t> {
+  Lambda computeDist;
   int32_t& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -574,11 +568,11 @@ struct loadAndComputeDist<kUnroll,
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int8_t, int32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, int32_t> {
+  Lambda computeDist;
   int32_t& dist;
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -641,11 +635,11 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 2, int
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename computeLambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int8_t, int32_t> {
-  computeLambda computeDist;
+template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
+struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, int32_t> {
+  Lambda computeDist;
   int32_t& dist;
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, computeLambda op)
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
     : dist(dist), computeDist(op)
   {
   }
@@ -710,23 +704,22 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, computeLambda, 1, int
   }
 };
 
-template <int Capacity, int Veclen, typename T, typename value_t, typename distLambda, bool GREATER>
-__global__ void interleaved_scan(
-  const T* queries,        // Input: Query Vector; [batch_size, dim]
-  uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
-  uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
-  const T* list_data,      // Record the full value of vector for each cluster(list) interleaved;
-                           // [nrow, dim]
-  uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
-  uint32_t* list_prefix_interleave,           // The start offset of each cluster(list) for
-                                              // list_index; [nlist]
-  const raft::distance::DistanceType metric,  // Function to process the different metric
-  distLambda computeDist,
+template <int Capacity, int Veclen, typename T, typename AccT, typename Lambda, bool Greater>
+__global__ void interleaved_scan_kernel(
+  const T* queries,              // Input: Query Vector; [batch_size, dim]
+  const uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
+  const uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
+  const T* list_data,  // Record the full value of vector for each cluster(list) interleaved;
+                       // [nrow, dim]
+  const uint32_t* list_lengths,            // The number of vectors in each cluster(list); [nlist]
+  const uint32_t* list_prefix_interleave,  // The start offset of each cluster(list) for
+                                           // list_index; [nlist]
+  Lambda computeDist,
   const uint32_t nprobe,
   const uint32_t k,
   const uint32_t dim,
-  size_t* neighbors,  // [batch_size, nprobe]
-  float* distances    // [batch_size, nprobe]
+  size_t* neighbors,  // [batch_size, nprobe, k]
+  float* distances    // [batch_size, nprobe, k]
 )
 {
 #ifdef USE_FAISS
@@ -735,7 +728,7 @@ __global__ void interleaved_scan(
   __shared__ float smemK[utils::kNumWarps * 32];
   __shared__ size_t smemV[utils::kNumWarps * 32];
 
-  constexpr auto Dir = GREATER;
+  constexpr auto Dir = Greater;
   constexpr auto identity =
     Dir ? std::numeric_limits<float>::min() : std::numeric_limits<float>::max();
   constexpr auto keyMax =
@@ -747,7 +740,7 @@ __global__ void interleaved_scan(
 
 #else
   extern __shared__ __align__(256) uint8_t smem_ext[];
-  topk::block_sort<topk::warp_sort_filtered, Capacity, !GREATER, float, size_t> queue(k, smem_ext);
+  topk::block_sort<topk::warp_sort_filtered, Capacity, !Greater, float, size_t> queue(k, smem_ext);
 #endif
 
   using align_warp = Pow2<WarpSize>;
@@ -794,7 +787,7 @@ __global__ void interleaved_scan(
     const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
 
     for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
-      value_t dist = 0;
+      AccT dist = 0;
       // This is the vector a given lane/thread handles
       const uint32_t vec = block * WarpSize + laneId;
       bool valid         = vec < numVecs;
@@ -815,7 +808,7 @@ __global__ void interleaved_scan(
                              decltype(computeDist),
                              Veclen,
                              T,
-                             value_t>
+                             AccT>
             obj(dist, computeDist);
 #pragma unroll
           for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
@@ -827,12 +820,7 @@ __global__ void interleaved_scan(
       if (dim > queryShmemSize) {
         constexpr int kUnroll = WarpSize / Veclen;
         ;
-        loadAndComputeDist<kUnroll,
-                           wordsPerVectorBlockDim,
-                           decltype(computeDist),
-                           Veclen,
-                           T,
-                           value_t>
+        loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, AccT>
           obj(dist, computeDist);
         for (int dBase = shLoadDim; dBase < dimBlocks; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
@@ -845,7 +833,7 @@ __global__ void interleaved_scan(
           /// Remainder chunk = dim - dimBlocks
           for (int d = 0; d < dim - dimBlocks;
                d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
-            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, value_t>
+            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, AccT>
               obj(dist, computeDist);
             obj.runLoadShmemCompute(data, queryShared, laneId, dimBlocks + d, 0);
           }  // end for d < dim - dimBlocks
@@ -853,7 +841,7 @@ __global__ void interleaved_scan(
       }
 
       // Enqueue one element per thread
-      constexpr float kDummy = GREATER ? lower_bound<float>() : upper_bound<float>();
+      constexpr float kDummy = Greater ? lower_bound<float>() : upper_bound<float>();
       float val              = (valid) ? (float)dist : kDummy;
       queue.add(val, idx);
     }  // end for block < numBlocks
@@ -874,7 +862,7 @@ __global__ void interleaved_scan(
 }  // end kernel
 
 template <typename T>
-dim3 launchConfigGenerator(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
+dim3 configure_launch(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
 {
   int devId;
   RAFT_CUDA_TRY(cudaGetDevice(&devId));
@@ -903,24 +891,23 @@ dim3 launchConfigGenerator(uint32_t numQueries, uint32_t nprobe, int32_t sMemSiz
 }
 
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
-void launch_with_lambda(Lambda lambda,
-                        raft::distance::DistanceType metric,
-                        const T* queries,
-                        uint32_t* coarse_index,
-                        uint32_t* list_index,
-                        T* list_data,
-                        uint32_t* list_lengths,
-                        uint32_t* list_prefix_interleave,
-                        const uint32_t nprobe,
-                        const uint32_t k,
-                        const uint32_t dim,
-                        size_t* neighbors,
-                        float* distances,
-                        const uint32_t batch_size,
-                        uint32_t& gridDimX,
-                        rmm::cuda_stream_view stream)
+void launch_kernel(Lambda lambda,
+                   const T* queries,
+                   const uint32_t* coarse_index,
+                   const uint32_t* list_index,
+                   const T* list_data,
+                   const uint32_t* list_lengths,
+                   const uint32_t* list_prefix_interleave,
+                   const uint32_t nprobe,
+                   const uint32_t k,
+                   const uint32_t dim,
+                   size_t* neighbors,
+                   float* distances,
+                   const uint32_t batch_size,
+                   uint32_t& gridDimX,
+                   rmm::cuda_stream_view stream)
 {
-  constexpr auto kKernel = interleaved_scan<Capacity, Veclen, T, AccT, Lambda, Greater>;
+  constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, T, AccT, Lambda, Greater>;
 #ifdef USE_FAISS
   int smem_size = 0;
 #else
@@ -928,7 +915,7 @@ void launch_with_lambda(Lambda lambda,
     utils::kNumWarps, k);
 #endif
 
-  dim3 grid_dim = launchConfigGenerator(batch_size, nprobe, smem_size, kKernel);
+  dim3 grid_dim = configure_launch(batch_size, nprobe, smem_size, kKernel);
   if (gridDimX == 0) {
     gridDimX = grid_dim.x;
     return;
@@ -940,7 +927,6 @@ void launch_with_lambda(Lambda lambda,
                                                       list_data,
                                                       list_lengths,
                                                       list_prefix_interleave,
-                                                      metric,
                                                       lambda,
                                                       nprobe,
                                                       k,
@@ -949,53 +935,65 @@ void launch_with_lambda(Lambda lambda,
                                                       distances);
 }
 
-template <int Capacity, int Veclen, bool Greater, typename T, typename acc_type, typename... Args>
-void launch_interleaved_scan_kernel(raft::distance::DistanceType metric, Args&&... args)
-{
-  // Accumulation inner product lambda
-  auto inner_prod_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
-    if constexpr ((std::is_same<T, int8_t>{}) || (std::is_same<T, uint8_t>{})) {
-      if constexpr (Veclen == 1) {
-        acc += x * y;
-      } else {
-        acc = dp4a(x, y, acc);
-      }
-    } else if constexpr (std::is_same<T, float>{}) {
-      acc += x * y;
+template <int Veclen, typename T, typename AccT>
+struct euclidean_dist {
+  __device__ inline void operator()(AccT& acc, AccT x, AccT y)
+  {
+    const AccT diff = x - y;
+    acc += diff * diff;
+  }
+};
+
+template <int Veclen>
+struct euclidean_dist<Veclen, uint8_t, uint32_t> {
+  __device__ inline void operator()(uint32_t& acc, uint32_t x, uint32_t y)
+  {
+    if constexpr (Veclen > 1) {
+      const uint32_t diff = __vabsdiffu4(x, y);
+      acc                 = dp4a(diff, diff, acc);
+    } else {
+      const uint32_t diff = x - y;
+      acc += diff * diff;
     }
-  };
-
-  // Accumulation euclidean L2 lambda
-  auto euclidean_lambda = [] __device__(acc_type & acc, acc_type & x, acc_type & y) {
-    if constexpr ((std::is_same<T, uint8_t>{})) {
-      if constexpr (Veclen == 1) {
-        const acc_type diff = x - y;
-        acc += diff * diff;
-      } else {
-        const acc_type diff = __vabsdiffu4(x, y);
-        acc                 = dp4a(diff, diff, acc);
-      }
-    } else if constexpr (std::is_same<T, int8_t>{}) {
-      if constexpr (Veclen == 1) {
-        const acc_type diff = x - y;
-        acc += diff * diff;
-      } else {
-        asm("vabsdiff4.u32.s32.s32 %0,%1,%2,%3;" : "=r"(x) : "r"(x), "r"(y), "r"(0));
-        acc = dp4a(x, x, acc);
-      }
-    } else if constexpr ((std::is_same<T, float>{})) {
-      const acc_type diff = x - y;
+  }
+};
+
+template <int Veclen>
+struct euclidean_dist<Veclen, int8_t, int32_t> {
+  __device__ inline void operator()(int32_t& acc, int32_t x, int32_t y)
+  {
+    if constexpr (Veclen > 1) {
+      const int32_t diff = static_cast<int32_t>(__vabsdiffs4(x, y));
+      acc                = dp4a(diff, diff, acc);
+    } else {
+      const int32_t diff = x - y;
       acc += diff * diff;
     }
-  };
+  }
+};
+
+template <int Veclen, typename T, typename AccT>
+struct inner_prod_dist {
+  __device__ inline void operator()(AccT& acc, AccT x, AccT y)
+  {
+    if constexpr (Veclen > 1 && (std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>)) {
+      acc = dp4a(x, y, acc);
+    } else {
+      acc += x * y;
+    }
+  }
+};
 
+/** Select the distance computation function and forward the rest of the arguments. */
+template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename... Args>
+void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
+{
   if (metric == raft::distance::DistanceType::L2Expanded ||
       metric == raft::distance::DistanceType::L2Unexpanded) {
-    launch_with_lambda<Capacity, Veclen, Greater, T, acc_type, decltype(euclidean_lambda)>(
-      euclidean_lambda, metric, args...);
+    launch_kernel<Capacity, Veclen, Greater, T, AccT, euclidean_dist<Veclen, T, AccT>>({}, args...);
   } else {
-    launch_with_lambda<Capacity, Veclen, Greater, T, acc_type, decltype(inner_prod_lambda)>(
-      inner_prod_lambda, metric, args...);
+    launch_kernel<Capacity, Veclen, Greater, T, AccT, inner_prod_dist<Veclen, T, AccT>>({},
+                                                                                        args...);
   }
 }
 
@@ -1035,45 +1033,55 @@ struct select_interleaved_scan_kernel {
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
     if (greater) {
-      launch_interleaved_scan_kernel<Capacity, Veclen, true, T, AccT>(args...);
+      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT>(args...);
     } else {
-      launch_interleaved_scan_kernel<Capacity, Veclen, false, T, AccT>(args...);
+      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT>(args...);
     }
   }
 };
 
-// rmm::cuda_stream_view stream,
-// const T* queries,        // Input: Query Vector; [batch_size, dim]
-// uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
-// uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
-// T* list_data,            // Record the full value of vector for each cluster(list) interleaved;
-//                          // [nrow, dim]
-// uint32_t* list_lengths,  // The number of vectors in each cluster(list); [nlist]
-// uint32_t* list_prefix_interleave,     // The start offset of each cluster(list) for
-//                                       // list_index; [nlist]
-// raft::distance::DistanceType metric,  // Function to process the different metric
-// const uint32_t nprobe,
-// const uint32_t k,
-// const uint32_t dim,
-// size_t* neighbors,  // [batch_size, nprobe]
-// float* distances,   // [batch_size, nprobe]
-// const uint32_t batch_size,
-// uint32_t& gridDimX
-
+/**
+ * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
+ *
+ * @tparam T value type
+ * @tparam AccT accumulated type
+ *
+ * @param[in] queries device pointer to the query vectors [batch_size, dim]
+ * @param[in] coarse_index device pointer to the cluster (list) ids [batch_size, nprobe]
+ * @param[in] list_index device pointer to the row ids in each cluster [nrow]
+ * @param[in] list_data device pointer to the data in all clusters interleaved [nrow, dim]
+ * @param[in] list_lengths device pointer to the numbers of vectors in each cluster [nlist]
+ * @param[in] list_prefix_interleave device pointer to the offsets of each cluster in list_index
+ * [nlist]
+ * @param[in] metric type of the measured distance
+ * @param[in] nprobe number of nearest clusters to query
+ * @param[in] k number of nearest neighbors
+ * @param[in] batch_size number of query vectors
+ * @param[in] dim dimensionality of search data and query vectors
+ * @param[out] neighbors device pointer to the result indices for each query and cluster
+ * [batch_size, nprobe, k]
+ * @param[out] distances device pointer to the result distances for each query and cluster
+ * [batch_size, nprobe, k]
+ * @param[in] stream
+ * @param[in] greater whether to select nearest (false) or furthest (true) points w.r.t. the given
+ * metric.
+ * @param[in] veclen (optimization parameters) size of the vector for vectorized processing
+ * @param[inout] gridDimX number of blocks launched for each cluster
+ */
 template <typename T, typename AccT>
-void ivfflat_interleaved_scan(const T* queries,                  //[batch_size, dim]
-                              uint32_t* coarse_index,            //[batch_size,nprobe]
-                              uint32_t* list_index,              // [nrow]
-                              T* list_data,                      //[nrow, dim]
-                              uint32_t* list_lengths,            // [nlist]
-                              uint32_t* list_prefix_interleave,  // [nlist]
+void ivfflat_interleaved_scan(const T* queries,
+                              const uint32_t* coarse_index,
+                              const uint32_t* list_index,
+                              const T* list_data,
+                              const uint32_t* list_lengths,
+                              const uint32_t* list_prefix_interleave,
                               const raft::distance::DistanceType metric,
                               const uint32_t nprobe,
                               const uint32_t k,
                               const uint32_t batch_size,
                               const uint32_t dim,
-                              size_t* neighbors,  // [batch_size, nprobe, k]
-                              float* distances,   // [batch_size, nprobe, k]
+                              size_t* neighbors,
+                              float* distances,
                               rmm::cuda_stream_view stream,
                               const bool greater,
                               const int veclen,

From 7b2b9ff6e52ecfe87ec214dcdeff66ddce6008b4 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 27 May 2022 14:41:12 +0200
Subject: [PATCH 040/118] Documenting and refactoring the kernel

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 141 +++++++++---------
 1 file changed, 72 insertions(+), 69 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index e4c7f8a25a..1c7ef83e8d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -103,10 +103,11 @@ template <int kUnroll,
           typename T,
           typename AccT>
 struct loadAndComputeDist {
-  Lambda computeDist;
+  Lambda compute_dist;
   AccT& dist;
 
-  __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op) : dist(dist), computeDist(op)
+  __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -128,7 +129,7 @@ struct loadAndComputeDist {
       lds(queryRegs[j], &queryShared[d]);
 #pragma unroll
       for (int k = 0; k < Veclen; ++k) {
-        computeDist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[j][k], encV[j][k]);
       }
     }
   }
@@ -154,7 +155,7 @@ struct loadAndComputeDist {
 #pragma unroll
         for (int k = 0; k < Veclen; ++k) {
           q[k] = shfl(queryReg, d + k, WarpSize);
-          computeDist(dist, q[k], encV[j][k]);  //@TODO add other metrics
+          compute_dist(dist, q[k], encV[j][k]);  //@TODO add other metrics
         }
       }
     }
@@ -173,7 +174,7 @@ struct loadAndComputeDist {
 #pragma unroll
       for (int k = 0; k < Veclen; k++) {
         q[k] = shfl(queryReg, d + k, WarpSize);
-        computeDist(dist, q[k], enc[k]);
+        compute_dist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
@@ -187,11 +188,11 @@ struct loadAndComputeDist<kUnroll,
                           uint8_veclen,
                           uint8_t,
                           uint32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   uint32_t& dist;
 
   __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -215,7 +216,7 @@ struct loadAndComputeDist<kUnroll,
       lds(queryRegs[j], reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex) + d);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        computeDist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[j][k], encV[j][k]);
       }
     }
   }
@@ -242,7 +243,7 @@ struct loadAndComputeDist<kUnroll,
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
           q[j][k] = shfl(queryReg, d + k, WarpSize);
-          computeDist(dist, q[j][k], encV[j][k]);
+          compute_dist(dist, q[j][k], encV[j][k]);
         }
       }
     }
@@ -265,7 +266,7 @@ struct loadAndComputeDist<kUnroll,
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         q[k] = shfl(queryReg, (d / 4) + k, WarpSize);
-        computeDist(dist, q[k], enc[k]);
+        compute_dist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
@@ -275,11 +276,11 @@ struct loadAndComputeDist<kUnroll,
 // using above common template of int2/int4
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, uint32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   uint32_t& dist;
 
   __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -297,7 +298,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
       encV[j]     = reinterpret_cast<unsigned const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
       const int d = (iShmemIndex * kUnroll + j);
       queryRegs[j] = reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex)[d];
-      computeDist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
@@ -319,7 +320,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
         encV[j]     = reinterpret_cast<unsigned const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
         q[j]        = shfl(queryReg, d, WarpSize);
-        computeDist(dist, q[j], encV[j]);
+        compute_dist(dist, q[j], encV[j]);
       }
     }
   }
@@ -336,18 +337,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      computeDist(dist, q, enc);
+      compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
 };
 
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, uint32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   uint32_t& dist;
 
   __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -366,7 +367,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
       const int d = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
       queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
-      computeDist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
 
@@ -390,7 +391,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
         q[j]        = shfl(queryReg, d, WarpSize);
-        computeDist(dist, q[j], encV[j]);
+        compute_dist(dist, q[j], encV[j]);
       }
     }
   }
@@ -408,18 +409,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      computeDist(dist, q, enc);
+      compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
 };
 
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, uint32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   uint32_t& dist;
 
   __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -436,7 +437,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
       encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
       const int d  = (iShmemIndex * kUnroll + j);
       queryRegs[j] = queryShared[baseShmemIndex + d];
-      computeDist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
 
@@ -460,7 +461,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
         q[j]        = shfl(queryReg, d, WarpSize);
-        computeDist(dist, q[j], encV[j]);
+        compute_dist(dist, q[j], encV[j]);
       }
     }
   }
@@ -479,7 +480,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
       uint32_t enc = 0;
       enc          = data[laneId];
       uint32_t q   = shfl(queryReg, d, WarpSize);
-      computeDist(dist, q, enc);
+      compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
 };
@@ -487,11 +488,11 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
 // This device function is for int8 veclens 4, 8 and 16
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda, int int8_veclen>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen, int8_t, int32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   int32_t& dist;
 
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -514,7 +515,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
       lds(queryRegs[j], reinterpret_cast<int32_t const*>(queryShared + baseShmemIndex) + d);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        computeDist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[j][k], encV[j][k]);
       }
     }
   }
@@ -542,7 +543,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
           q[j][k] = shfl(queryReg, d + k, WarpSize);
-          computeDist(dist, q[j][k], encV[j][k]);
+          compute_dist(dist, q[j][k], encV[j][k]);
         }
       }
     }
@@ -562,7 +563,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         q[k] = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
-        computeDist(dist, q[k], enc[k]);
+        compute_dist(dist, q[k], enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
@@ -570,10 +571,10 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
 
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, int32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
@@ -591,7 +592,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
       const int d = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
       queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
-      computeDist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
 
@@ -615,7 +616,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
         encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
         q[j]        = shfl(queryReg, d, WarpSize);
-        computeDist(dist, q[j], encV[j]);
+        compute_dist(dist, q[j], encV[j]);
       }
     }
   }
@@ -630,17 +631,17 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
       int32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      computeDist(dist, q, enc);
+      compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
 };
 
 template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
 struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, int32_t> {
-  Lambda computeDist;
+  Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), computeDist(op)
+    : dist(dist), compute_dist(op)
   {
   }
 
@@ -660,7 +661,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
       const int d  = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
       queryRegs[j] = queryShared[baseShmemIndex + d];
-      computeDist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
 
@@ -684,7 +685,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
         encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
         const int d = (i * kUnroll + j);
         q[j]        = shfl(queryReg, d, WarpSize);
-        computeDist(dist, q[j], encV[j]);
+        compute_dist(dist, q[j], encV[j]);
       }
     }
   }
@@ -699,28 +700,27 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
       int32_t enc = 0;
       enc         = data[laneId];
       int32_t q   = shfl(queryReg, d, WarpSize);
-      computeDist(dist, q, enc);
+      compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
   }
 };
 
-template <int Capacity, int Veclen, typename T, typename AccT, typename Lambda, bool Greater>
-__global__ void interleaved_scan_kernel(
-  const T* queries,              // Input: Query Vector; [batch_size, dim]
-  const uint32_t* coarse_index,  // Record the cluster(list) id; [batch_size,nprobe]
-  const uint32_t* list_index,    // Record the id of vector for each cluster(list); [nrow]
-  const T* list_data,  // Record the full value of vector for each cluster(list) interleaved;
-                       // [nrow, dim]
-  const uint32_t* list_lengths,            // The number of vectors in each cluster(list); [nlist]
-  const uint32_t* list_prefix_interleave,  // The start offset of each cluster(list) for
-                                           // list_index; [nlist]
-  Lambda computeDist,
-  const uint32_t nprobe,
-  const uint32_t k,
-  const uint32_t dim,
-  size_t* neighbors,  // [batch_size, nprobe, k]
-  float* distances    // [batch_size, nprobe, k]
-)
+/**
+ * See `ivfflat_interleaved_scan` for parameter docs.
+ */
+template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
+__global__ void interleaved_scan_kernel(Lambda compute_dist,
+                                        const T* queries,
+                                        const uint32_t* coarse_index,
+                                        const uint32_t* list_index,
+                                        const T* list_data,
+                                        const uint32_t* list_lengths,
+                                        const uint32_t* list_prefix_interleave,
+                                        const uint32_t nprobe,
+                                        const uint32_t k,
+                                        const uint32_t dim,
+                                        size_t* neighbors,
+                                        float* distances)
 {
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
@@ -750,8 +750,7 @@ __global__ void interleaved_scan_kernel(
 
   /// Set the address
   auto query                           = queries + queryId * dim;
-  constexpr int bytesPerVectorBlockDim = sizeof(T) * WarpSize;
-  constexpr int wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(T);
+  constexpr int wordsPerVectorBlockDim = WarpSize;
 
   // int wordsPerVectorBlock = wordsPerVectorBlockDim * dim;
   const int dimBlocks = align_warp::roundDown(dim);
@@ -769,6 +768,7 @@ __global__ void interleaved_scan_kernel(
   __syncthreads();
   shLoadDim = (dim > queryShmemSize) ? (shLoadDim * Veclen) : dimBlocks;
 
+  // Every CUDA block scans one cluster at a time.
   for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
     uint32_t listId = coarse_index[queryId * nprobe + probeId];  // The id of cluster(list)
 
@@ -786,6 +786,9 @@ __global__ void interleaved_scan_kernel(
     // The number of interleaved group to be processed
     const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
 
+    // Every warp reads WarpSize vectors and computes the distances to them.
+    // Then, the distances and corresponding ids are distributed among the threads,
+    // and each thread adds one (id, dist) pair to the filtering queue.
     for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
       AccT dist = 0;
       // This is the vector a given lane/thread handles
@@ -805,11 +808,11 @@ __global__ void interleaved_scan_kernel(
 
           loadAndComputeDist<kUnroll,
                              wordsPerVectorBlockDim,
-                             decltype(computeDist),
+                             decltype(compute_dist),
                              Veclen,
                              T,
                              AccT>
-            obj(dist, computeDist);
+            obj(dist, compute_dist);
 #pragma unroll
           for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
             obj.runLoadShmemCompute(data, queryShared, laneId, dBase, i);
@@ -820,8 +823,8 @@ __global__ void interleaved_scan_kernel(
       if (dim > queryShmemSize) {
         constexpr int kUnroll = WarpSize / Veclen;
         ;
-        loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, AccT>
-          obj(dist, computeDist);
+        loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
+          obj(dist, compute_dist);
         for (int dBase = shLoadDim; dBase < dimBlocks; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
         }
@@ -833,8 +836,8 @@ __global__ void interleaved_scan_kernel(
           /// Remainder chunk = dim - dimBlocks
           for (int d = 0; d < dim - dimBlocks;
                d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
-            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(computeDist), Veclen, T, AccT>
-              obj(dist, computeDist);
+            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
+              obj(dist, compute_dist);
             obj.runLoadShmemCompute(data, queryShared, laneId, dimBlocks + d, 0);
           }  // end for d < dim - dimBlocks
         }
@@ -907,7 +910,7 @@ void launch_kernel(Lambda lambda,
                    uint32_t& gridDimX,
                    rmm::cuda_stream_view stream)
 {
-  constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, T, AccT, Lambda, Greater>;
+  constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
 #ifdef USE_FAISS
   int smem_size = 0;
 #else
@@ -921,13 +924,13 @@ void launch_kernel(Lambda lambda,
     return;
   }
   dim3 block_dim(utils::kThreadPerBlock);
-  kKernel<<<grid_dim, block_dim, smem_size, stream>>>(queries,
+  kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
+                                                      queries,
                                                       coarse_index,
                                                       list_index,
                                                       list_data,
                                                       list_lengths,
                                                       list_prefix_interleave,
-                                                      lambda,
                                                       nprobe,
                                                       k,
                                                       dim,
@@ -1066,7 +1069,7 @@ struct select_interleaved_scan_kernel {
  * @param[in] greater whether to select nearest (false) or furthest (true) points w.r.t. the given
  * metric.
  * @param[in] veclen (optimization parameters) size of the vector for vectorized processing
- * @param[inout] gridDimX number of blocks launched for each cluster
+ * @param[inout] gridDimX number of blocks launched across all nprobe clusters.
  */
 template <typename T, typename AccT>
 void ivfflat_interleaved_scan(const T* queries,

From b1208edf00c606dcf0d55ed7f0c2f060b691ee78 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 08:01:11 +0200
Subject: [PATCH 041/118] Add a case of high dimensionality

---
 cpp/test/spatial/ann_ivf_flat.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index dc66dd43e6..58dcc8bfe5 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -269,7 +269,9 @@ const std::vector<AnnIvfFlatInputs> inputs = {
   {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct},
   {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct},
   {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct},
-  {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}};
+  {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct},
+
+  {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct}};
 
 typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
 TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(false); }

From a30ade5ca25bc3f9eb73061d9dc07ccb0877ac5f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 09:27:16 +0200
Subject: [PATCH 042/118] Add more sync into the test to detect device errors

---
 cpp/test/spatial/ann_ivf_flat.cu | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 58dcc8bfe5..5b0bb03634 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -24,6 +24,7 @@
 
 #include <raft/spatial/knn/knn.cuh>
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
 #include <gtest/gtest.h>
@@ -122,6 +123,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       faiss_indices_(params_.num_queries * params_.k, stream_),
       faiss_distances_(params_.num_queries * params_.k, stream_)
   {
+    handle_.sync_stream(stream_);
     RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(DataT), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(DataT), stream_));
@@ -133,11 +135,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
+    handle_.sync_stream(stream_);
   }
 
  protected:
   void testIVFFlat(bool is8bit)
   {
+    handle_.sync_stream(stream_);
     if constexpr (std::is_same<DataT, uint8_t>{}) {
       naiveBfKnn<uint8_t, uint32_t>(faiss_distances_.data(),
                                     faiss_indices_.data(),
@@ -178,6 +182,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                                2.0f,
                                stream_);
     }
+    handle_.sync_stream(stream_);
 
     raft::spatial::knn::IVFFlatParam ivfParams;
     ivfParams.nprobe = nprobe_;
@@ -194,6 +199,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                            database.data(),
                            num_db_vecs,
                            dim);
+    handle_.sync_stream(stream_);
     approx_knn_search(handle_,
                       raft_distances_.data(),
                       raft_indices_.data(),
@@ -202,7 +208,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                       k_,
                       search_queries.data(),
                       num_queries);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
     // verify.
     devArrMatchKnnPair(faiss_indices_.data(),
                        raft_indices_.data(),
@@ -212,10 +218,12 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                        k_,
                        float(0.001),
                        stream_);
+    handle_.sync_stream(stream_);
   }
 
   void SetUp() override
   {
+    handle_.sync_stream(stream_);
     num_queries = params_.num_queries;
     num_db_vecs = params_.num_db_vecs;
     dim         = params_.dim;
@@ -233,11 +241,12 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       r.uniformInt(database.data(), num_db_vecs * dim, DataT(1), DataT(20), stream_);
       r.uniformInt(search_queries.data(), num_queries * dim, DataT(1), DataT(20), stream_);
     }
+    handle_.sync_stream(stream_);
   }
 
  private:
   raft::handle_t handle_;
-  cudaStream_t stream_ = 0;
+  rmm::cuda_stream_view stream_;
   AnnIvfFlatInputs params_;
   int num_queries;
   int num_db_vecs;

From 84db732b72196bdcd2a0dfe6a9d62a28e2040a50 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 10:20:01 +0200
Subject: [PATCH 043/118] Add more sync into the test to detect device errors

---
 cpp/test/spatial/ann_ivf_flat.cu | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 5b0bb03634..337febeca3 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -123,7 +123,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       faiss_indices_(params_.num_queries * params_.k, stream_),
       faiss_distances_(params_.num_queries * params_.k, stream_)
   {
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(DataT), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(DataT), stream_));
@@ -135,13 +135,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
   }
 
  protected:
   void testIVFFlat(bool is8bit)
   {
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     if constexpr (std::is_same<DataT, uint8_t>{}) {
       naiveBfKnn<uint8_t, uint32_t>(faiss_distances_.data(),
                                     faiss_indices_.data(),
@@ -182,7 +182,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                                2.0f,
                                stream_);
     }
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
 
     raft::spatial::knn::IVFFlatParam ivfParams;
     ivfParams.nprobe = nprobe_;
@@ -199,7 +199,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                            database.data(),
                            num_db_vecs,
                            dim);
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     approx_knn_search(handle_,
                       raft_distances_.data(),
                       raft_indices_.data(),
@@ -208,7 +208,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                       k_,
                       search_queries.data(),
                       num_queries);
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     // verify.
     devArrMatchKnnPair(faiss_indices_.data(),
                        raft_indices_.data(),
@@ -218,12 +218,12 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                        k_,
                        float(0.001),
                        stream_);
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
   }
 
   void SetUp() override
   {
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     num_queries = params_.num_queries;
     num_db_vecs = params_.num_db_vecs;
     dim         = params_.dim;
@@ -241,7 +241,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       r.uniformInt(database.data(), num_db_vecs * dim, DataT(1), DataT(20), stream_);
       r.uniformInt(search_queries.data(), num_queries * dim, DataT(1), DataT(20), stream_);
     }
-    handle_.sync_stream(stream_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
   }
 
  private:

From 346afb29a723a9d5667695b814e80631a54eb49d Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 14:29:45 +0200
Subject: [PATCH 044/118] Allow large batch sizes and document more functions

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |   1 +
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 144 ++++++++++--------
 cpp/test/spatial/ann_ivf_flat.cu              |  20 +--
 3 files changed, 94 insertions(+), 71 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 3af29208b8..592e8f2232 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -603,6 +603,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t nprobe,
     search_mem_res.emplace(cur_memory_resource,
                            Pow2<256>::roundUp(max_batch * nprobe * max_k * 16));
   }
+
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 1c7ef83e8d..15e0093bae 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -23,6 +23,7 @@
 #include "topk/warpsort_topk.cuh"
 
 #include <raft/common/device_loads_stores.cuh>
+#include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.cuh>
@@ -37,6 +38,14 @@
 
 namespace raft::spatial::knn::detail {
 
+/**
+ * @brief Copy Veclen elements of type T from `query` to `queryShared` at position `loadDim *
+ * Veclen`.
+ *
+ * @param[in] query a pointer to a device global memory
+ * @param[out] queryShared a pointer to a device shared memory
+ * @param loadDim position at which to start copying elements.
+ */
 template <typename T, int Veclen>
 __device__ __forceinline__ void queryLoadToShmem(const T* const& query,
                                                  T* queryShared,
@@ -752,21 +761,24 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
   auto query                           = queries + queryId * dim;
   constexpr int wordsPerVectorBlockDim = WarpSize;
 
-  // int wordsPerVectorBlock = wordsPerVectorBlockDim * dim;
-  const int dimBlocks = align_warp::roundDown(dim);
+  // How many full warps needed to compute the distance (without remainder)
+  const int full_warps_along_dim = align_warp::roundDown(dim);
 
+  // Using shared memory for the query;
+  // This allows to save on global memory bandwidth when reading index and query
+  // data at the same time.
   // This should be multiple of warpSize = 32
   constexpr uint32_t queryShmemSize = 2048;
   __shared__ T queryShared[queryShmemSize];
 
   int shLoadDim = (dim < queryShmemSize) ? dim : queryShmemSize;
-  shLoadDim     = shLoadDim / Veclen;
 
-  for (int loadDim = threadIdx.x; loadDim < shLoadDim; loadDim += blockDim.x) {
+  // load the query data from global to shared memory
+  for (int loadDim = threadIdx.x; loadDim * Veclen < shLoadDim; loadDim += blockDim.x) {
     queryLoadToShmem<T, Veclen>(query, queryShared, loadDim);
   }
   __syncthreads();
-  shLoadDim = (dim > queryShmemSize) ? (shLoadDim * Veclen) : dimBlocks;
+  shLoadDim = (dim > queryShmemSize) ? shLoadDim : full_warps_along_dim;
 
   // Every CUDA block scans one cluster at a time.
   for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
@@ -817,7 +829,7 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
           for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
             obj.runLoadShmemCompute(data, queryShared, laneId, dBase, i);
           }  // end for i < WarpSize / kUnroll
-        }    // end for dBase < dimBlocks
+        }    // end for dBase < full_warps_along_dim
       }
 
       if (dim > queryShmemSize) {
@@ -825,21 +837,21 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
         ;
         loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
           obj(dist, compute_dist);
-        for (int dBase = shLoadDim; dBase < dimBlocks; dBase += WarpSize) {  //
+        for (int dBase = shLoadDim; dBase < full_warps_along_dim; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
         }
-        // Remainder chunk = dim - dimBlocks
-        obj.runLoadShflAndComputeRemainder(data, query, laneId, dim, dimBlocks);
-        // end for d < dim - dimBlocks
+        // Remainder chunk = dim - full_warps_along_dim
+        obj.runLoadShflAndComputeRemainder(data, query, laneId, dim, full_warps_along_dim);
+        // end for d < dim - full_warps_along_dim
       } else {
         if (valid) {
-          /// Remainder chunk = dim - dimBlocks
-          for (int d = 0; d < dim - dimBlocks;
+          /// Remainder chunk = dim - full_warps_along_dim
+          for (int d = 0; d < dim - full_warps_along_dim;
                d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
             loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
               obj(dist, compute_dist);
-            obj.runLoadShmemCompute(data, queryShared, laneId, dimBlocks + d, 0);
-          }  // end for d < dim - dimBlocks
+            obj.runLoadShmemCompute(data, queryShared, laneId, full_warps_along_dim + d, 0);
+          }  // end for d < dim - full_warps_along_dim
         }
       }
 
@@ -864,33 +876,23 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
 #endif
 }  // end kernel
 
+/**
+ *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size
+ */
 template <typename T>
-dim3 configure_launch(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
+uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
 {
-  int devId;
-  RAFT_CUDA_TRY(cudaGetDevice(&devId));
-  int numSMs;
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, devId));
-  int numBlocksPerSm = 0;
-  dim3 grid;
+  int dev_id;
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+  int num_sms;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  int num_blocks_per_sm = 0;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &numBlocksPerSm, func, utils::kThreadPerBlock, sMemSize));
-
-  std::size_t minGridSize = numSMs * numBlocksPerSm;
-  std::size_t yChunks     = numQueries;
-  std::size_t xChunks     = nprobe;
-  // grid.y                  = yChunks > minGridSize ? minGridSize : yChunks;
-  grid.y = yChunks;
-  grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
-  if (grid.x != 1) {
-    std::size_t i = 1;
-    while (grid.y * i < minGridSize) {
-      i++;
-    }
-    grid.x = i >= xChunks ? xChunks : i;
-  }
+    &num_blocks_per_sm, func, utils::kThreadPerBlock, sMemSize));
 
-  return grid;
+  size_t min_grid_size = num_sms * num_blocks_per_sm;
+  size_t min_grid_x    = ceildiv<size_t>(min_grid_size, numQueries);
+  return min_grid_x > nprobe ? nprobe : static_cast<uint32_t>(min_grid_x);
 }
 
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
@@ -907,7 +909,7 @@ void launch_kernel(Lambda lambda,
                    size_t* neighbors,
                    float* distances,
                    const uint32_t batch_size,
-                   uint32_t& gridDimX,
+                   uint32_t& grid_dim_x,
                    rmm::cuda_stream_view stream)
 {
   constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
@@ -918,24 +920,40 @@ void launch_kernel(Lambda lambda,
     utils::kNumWarps, k);
 #endif
 
-  dim3 grid_dim = configure_launch(batch_size, nprobe, smem_size, kKernel);
-  if (gridDimX == 0) {
-    gridDimX = grid_dim.x;
+  // power-of-two less than cuda limit (for better addr alignment)
+  constexpr uint32_t kMaxGridY = 32768;
+
+  if (grid_dim_x == 0) {
+    grid_dim_x = configure_launch_x(std::min(kMaxGridY, batch_size), nprobe, smem_size, kKernel);
     return;
   }
-  dim3 block_dim(utils::kThreadPerBlock);
-  kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
-                                                      queries,
-                                                      coarse_index,
-                                                      list_index,
-                                                      list_data,
-                                                      list_lengths,
-                                                      list_prefix_interleave,
-                                                      nprobe,
-                                                      k,
-                                                      dim,
-                                                      neighbors,
-                                                      distances);
+
+  for (uint32_t query_offset = 0; query_offset < batch_size; query_offset += kMaxGridY) {
+    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, batch_size - query_offset);
+    dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
+    dim3 block_dim(utils::kThreadPerBlock);
+    RAFT_LOG_TRACE(
+      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), nprobe = %d",
+      grid_dim.x,
+      grid_dim.y,
+      block_dim.x,
+      nprobe);
+    kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
+                                                        queries,
+                                                        coarse_index,
+                                                        list_index,
+                                                        list_data,
+                                                        list_lengths,
+                                                        list_prefix_interleave,
+                                                        nprobe,
+                                                        k,
+                                                        dim,
+                                                        neighbors,
+                                                        distances);
+    queries += grid_dim_y * dim;
+    neighbors += grid_dim_y * grid_dim_x * k;
+    distances += grid_dim_y * grid_dim_x * k;
+  }
 }
 
 template <int Veclen, typename T, typename AccT>
@@ -1031,7 +1049,9 @@ struct select_interleaved_scan_kernel {
       }
     }
     RAFT_EXPECTS(capacity == Capacity,
-                 "Capacity must be power-of-two not bigger than the maximum allowed size.");
+                 "Capacity must be power-of-two not bigger than the maximum allowed size "
+                 "topk::kMaxCapacity (%d).",
+                 topk::kMaxCapacity);
     RAFT_EXPECTS(
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
@@ -1058,18 +1078,20 @@ struct select_interleaved_scan_kernel {
  * [nlist]
  * @param[in] metric type of the measured distance
  * @param[in] nprobe number of nearest clusters to query
- * @param[in] k number of nearest neighbors
+ * @param[in] k number of nearest neighbors.
+ *            NB: the maximum value of `k` is limited statically by `topk::kMaxCapacity`.
  * @param[in] batch_size number of query vectors
  * @param[in] dim dimensionality of search data and query vectors
  * @param[out] neighbors device pointer to the result indices for each query and cluster
- * [batch_size, nprobe, k]
+ * [batch_size, grid_dim_x, k]
  * @param[out] distances device pointer to the result distances for each query and cluster
- * [batch_size, nprobe, k]
+ * [batch_size, grid_dim_x, k]
  * @param[in] stream
  * @param[in] greater whether to select nearest (false) or furthest (true) points w.r.t. the given
  * metric.
  * @param[in] veclen (optimization parameters) size of the vector for vectorized processing
- * @param[inout] gridDimX number of blocks launched across all nprobe clusters.
+ * @param[inout] grid_dim_x number of blocks launched across all nprobe clusters;
+ *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= nprobe)
  */
 template <typename T, typename AccT>
 void ivfflat_interleaved_scan(const T* queries,
@@ -1088,7 +1110,7 @@ void ivfflat_interleaved_scan(const T* queries,
                               rmm::cuda_stream_view stream,
                               const bool greater,
                               const int veclen,
-                              uint32_t& gridDimX)
+                              uint32_t& grid_dim_x)
 {
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
   select_interleaved_scan_kernel<T, AccT>::run(capacity,
@@ -1107,7 +1129,7 @@ void ivfflat_interleaved_scan(const T* queries,
                                                neighbors,
                                                distances,
                                                batch_size,
-                                               gridDimX,
+                                               grid_dim_x,
                                                stream);
 }
 
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 337febeca3..7c2a789177 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -123,7 +123,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       faiss_indices_(params_.num_queries * params_.k, stream_),
       faiss_distances_(params_.num_queries * params_.k, stream_)
   {
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(DataT), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(DataT), stream_));
@@ -135,13 +134,12 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
   }
 
  protected:
   void testIVFFlat(bool is8bit)
   {
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     if constexpr (std::is_same<DataT, uint8_t>{}) {
       naiveBfKnn<uint8_t, uint32_t>(faiss_distances_.data(),
                                     faiss_indices_.data(),
@@ -182,7 +180,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                                2.0f,
                                stream_);
     }
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
 
     raft::spatial::knn::IVFFlatParam ivfParams;
     ivfParams.nprobe = nprobe_;
@@ -199,7 +197,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                            database.data(),
                            num_db_vecs,
                            dim);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
     approx_knn_search(handle_,
                       raft_distances_.data(),
                       raft_indices_.data(),
@@ -208,7 +206,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                       k_,
                       search_queries.data(),
                       num_queries);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
     // verify.
     devArrMatchKnnPair(faiss_indices_.data(),
                        raft_indices_.data(),
@@ -218,12 +216,10 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                        k_,
                        float(0.001),
                        stream_);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
   }
 
   void SetUp() override
   {
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
     num_queries = params_.num_queries;
     num_db_vecs = params_.num_db_vecs;
     dim         = params_.dim;
@@ -241,7 +237,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       r.uniformInt(database.data(), num_db_vecs * dim, DataT(1), DataT(20), stream_);
       r.uniformInt(search_queries.data(), num_queries * dim, DataT(1), DataT(20), stream_);
     }
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_));
+    handle_.sync_stream(stream_);
   }
 
  private:
@@ -280,7 +276,11 @@ const std::vector<AnnIvfFlatInputs> inputs = {
   {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct},
   {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct},
 
-  {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct}};
+  {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct},
+
+  {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct},
+
+  {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}};
 
 typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
 TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(false); }

From fc201b5f25fed82c0c6f363a9320937a979bebb8 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 15:07:01 +0200
Subject: [PATCH 045/118] Add a lower bound on expected recall

---
 cpp/test/spatial/ann_ivf_flat.cu | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 7c2a789177..24ec18b527 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -68,6 +68,7 @@ testing::AssertionResult devArrMatchKnnPair(const T* expected_idx,
                                             size_t rows,
                                             size_t cols,
                                             const DistT eps,
+                                            double min_recall,
                                             cudaStream_t stream = 0)
 {
   size_t size = rows * cols;
@@ -107,6 +108,12 @@ testing::AssertionResult devArrMatchKnnPair(const T* expected_idx,
     }
   }
   std::cout << "Recall = " << match_count << "/" << rows * cols << std::endl;
+  double actual_recall = static_cast<double>(match_count) / static_cast<double>(rows * cols);
+  if (actual_recall < min_recall - eps) {
+    return testing::AssertionFailure()
+           << "actual recall (" << actual_recall
+           << ") is smaller than the minimum expected recall (" << min_recall << ").";
+  }
   return testing::AssertionSuccess();
 }
 
@@ -207,6 +214,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                       search_queries.data(),
                       num_queries);
     handle_.sync_stream(stream_);
+
+    // unless something is really wrong with clustering, this could serve as a lower bound on recall
+    double min_recall = static_cast<double>(nprobe_) / static_cast<double>(nlist_);
     // verify.
     devArrMatchKnnPair(faiss_indices_.data(),
                        raft_indices_.data(),
@@ -215,6 +225,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                        num_queries,
                        k_,
                        float(0.001),
+                       min_recall,
                        stream_);
   }
 

From 4021ea247b567b1045a70c4355f1af812af17e91 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 17:10:50 +0200
Subject: [PATCH 046/118] Compure required memory dynamically

---
 cpp/bench/spatial/knn.cu                      |   3 +-
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 105 ++++++++++--------
 2 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 04b15a38a6..e8273bd9ed 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -332,7 +332,8 @@ struct knn : public fixture {
   rmm::device_uvector<IdxT> out_idxs_;
 };
 
-const std::vector<params> kInputs{{2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}};
+const std::vector<params> kInputs{
+  {2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}, {10000, 8192, 1000, 32}};
 
 const std::vector<TransferStrategy> kAllStrategies{TransferStrategy::NO_COPY,
                                                    TransferStrategy::COPY_PLAIN,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 15e0093bae..1e8395ded7 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -39,70 +39,70 @@
 namespace raft::spatial::knn::detail {
 
 /**
- * @brief Copy Veclen elements of type T from `query` to `queryShared` at position `loadDim *
+ * @brief Copy Veclen elements of type T from `query` to `query_shared` at position `loadDim *
  * Veclen`.
  *
  * @param[in] query a pointer to a device global memory
- * @param[out] queryShared a pointer to a device shared memory
+ * @param[out] query_shared a pointer to a device shared memory
  * @param loadDim position at which to start copying elements.
  */
 template <typename T, int Veclen>
 __device__ __forceinline__ void queryLoadToShmem(const T* const& query,
-                                                 T* queryShared,
+                                                 T* query_shared,
                                                  const int loadDim)
 {
   T queryReg[Veclen];
   const int loadIndex = loadDim * Veclen;
   ldg(queryReg, query + loadIndex);
-  sts(&queryShared[loadIndex], queryReg);
+  sts(&query_shared[loadIndex], queryReg);
 }
 
 template <>
 __device__ __forceinline__ void queryLoadToShmem<uint8_t, 8>(const uint8_t* const& query,
-                                                             uint8_t* queryShared,
+                                                             uint8_t* query_shared,
                                                              const int loadDim)
 {
   constexpr int veclen = 2;  // 8 uint8_t
   uint32_t queryReg[veclen];
   const int loadIndex = loadDim * veclen;
   ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<uint32_t*>(queryShared) + loadIndex, queryReg);
+  sts(reinterpret_cast<uint32_t*>(query_shared) + loadIndex, queryReg);
 }
 
 template <>
 __device__ __forceinline__ void queryLoadToShmem<uint8_t, 16>(const uint8_t* const& query,
-                                                              uint8_t* queryShared,
+                                                              uint8_t* query_shared,
                                                               const int loadDim)
 {
   constexpr int veclen = 4;  // 16 uint8_t
   uint32_t queryReg[veclen];
   const int loadIndex = loadDim * veclen;
   ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<uint32_t*>(queryShared) + loadIndex, queryReg);
+  sts(reinterpret_cast<uint32_t*>(query_shared) + loadIndex, queryReg);
 }
 
 template <>
 __device__ __forceinline__ void queryLoadToShmem<int8_t, 8>(const int8_t* const& query,
-                                                            int8_t* queryShared,
+                                                            int8_t* query_shared,
                                                             const int loadDim)
 {
   constexpr int veclen = 2;  // 8 int8_t
   int32_t queryReg[veclen];
   const int loadIndex = loadDim * veclen;
   ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<int32_t*>(queryShared) + loadIndex, queryReg);
+  sts(reinterpret_cast<int32_t*>(query_shared) + loadIndex, queryReg);
 }
 
 template <>
 __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const& query,
-                                                             int8_t* queryShared,
+                                                             int8_t* query_shared,
                                                              const int loadDim)
 {
   constexpr int veclen = 4;  // 16 int8_t
   int32_t queryReg[veclen];
   const int loadIndex = loadDim * veclen;
   ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<int32_t*>(queryShared) + loadIndex, queryReg);
+  sts(reinterpret_cast<int32_t*>(query_shared) + loadIndex, queryReg);
 }
 
 template <int kUnroll,
@@ -122,7 +122,7 @@ struct loadAndComputeDist {
 
   template <typename IdxT>
   __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
-                                                      const T* queryShared,
+                                                      const T* query_shared,
                                                       IdxT loadIndex,
                                                       IdxT baseShmemIndex,
                                                       IdxT iShmemIndex)
@@ -135,7 +135,7 @@ struct loadAndComputeDist {
     for (int j = 0; j < kUnroll; ++j) {
       ldg(encV[j], data + (loadIndex + j * wordsPerVectorBlockDim) * Veclen);
       const int d = shmemStride + j * Veclen;
-      lds(queryRegs[j], &queryShared[d]);
+      lds(queryRegs[j], &query_shared[d]);
 #pragma unroll
       for (int k = 0; k < Veclen; ++k) {
         compute_dist(dist, queryRegs[j][k], encV[j][k]);
@@ -206,7 +206,7 @@ struct loadAndComputeDist<kUnroll,
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* queryShared,
+                                                      const uint8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -222,7 +222,7 @@ struct loadAndComputeDist<kUnroll,
           reinterpret_cast<unsigned const*>(data) + loadIndex +
             j * wordsPerVectorBlockDim * veclen_int);
       const int d = iShmemIndex * kUnroll + j * veclen_int;
-      lds(queryRegs[j], reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex) + d);
+      lds(queryRegs[j], reinterpret_cast<unsigned const*>(query_shared + baseShmemIndex) + d);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         compute_dist(dist, queryRegs[j][k], encV[j][k]);
@@ -294,7 +294,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* queryShared,
+                                                      const uint8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -306,7 +306,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
     for (int j = 0; j < kUnroll; ++j) {
       encV[j]     = reinterpret_cast<unsigned const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
       const int d = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = reinterpret_cast<unsigned const*>(queryShared + baseShmemIndex)[d];
+      queryRegs[j] = reinterpret_cast<unsigned const*>(query_shared + baseShmemIndex)[d];
       compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
@@ -362,7 +362,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* queryShared,
+                                                      const uint8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -375,7 +375,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
       encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
       const int d = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
-      queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
+      queryRegs[j] = reinterpret_cast<uint16_t const*>(query_shared + baseShmemIndex)[d];
       compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
@@ -434,7 +434,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* queryShared,
+                                                      const uint8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -445,7 +445,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
     for (int j = 0; j < kUnroll; ++j) {
       encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
       const int d  = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = queryShared[baseShmemIndex + d];
+      queryRegs[j] = query_shared[baseShmemIndex + d];
       compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
@@ -506,7 +506,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* queryShared,
+                                                      const int8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -521,7 +521,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
           reinterpret_cast<int32_t const*>(data) +
             (loadIndex + j * wordsPerVectorBlockDim) * veclen_int);
       const int d = iShmemIndex * kUnroll + j * veclen_int;
-      lds(queryRegs[j], reinterpret_cast<int32_t const*>(queryShared + baseShmemIndex) + d);
+      lds(queryRegs[j], reinterpret_cast<int32_t const*>(query_shared + baseShmemIndex) + d);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         compute_dist(dist, queryRegs[j][k], encV[j][k]);
@@ -587,7 +587,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
   {
   }
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* queryShared,
+                                                      const int8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -600,7 +600,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
       encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
       const int d = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
-      queryRegs[j] = reinterpret_cast<uint16_t const*>(queryShared + baseShmemIndex)[d];
+      queryRegs[j] = reinterpret_cast<uint16_t const*>(query_shared + baseShmemIndex)[d];
       compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
@@ -655,7 +655,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
   }
 
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* queryShared,
+                                                      const int8_t* query_shared,
                                                       int loadIndex,
                                                       int baseShmemIndex,
                                                       int iShmemIndex)
@@ -669,7 +669,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
       encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
       const int d  = (iShmemIndex * kUnroll + j);
       queryRegs[j] = 0;
-      queryRegs[j] = queryShared[baseShmemIndex + d];
+      queryRegs[j] = query_shared[baseShmemIndex + d];
       compute_dist(dist, queryRegs[j], encV[j]);
     }
   }
@@ -716,9 +716,12 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
 
 /**
  * See `ivfflat_interleaved_scan` for parameter docs.
+ *
+ * query_smem_elems must be multiple of WarpSize * Veclen
  */
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
 __global__ void interleaved_scan_kernel(Lambda compute_dist,
+                                        const uint32_t query_smem_elems,
                                         const T* queries,
                                         const uint32_t* coarse_index,
                                         const uint32_t* list_index,
@@ -731,6 +734,12 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
                                         size_t* neighbors,
                                         float* distances)
 {
+  extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
+  // Using shared memory for the (part of the) query;
+  // This allows to save on global memory bandwidth when reading index and query
+  // data at the same time.
+  // Its size is `query_smem_elems`.
+  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
   // for comparison purpose
@@ -748,8 +757,8 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
-  extern __shared__ __align__(256) uint8_t smem_ext[];
-  topk::block_sort<topk::warp_sort_filtered, Capacity, !Greater, float, size_t> queue(k, smem_ext);
+  topk::block_sort<topk::warp_sort_filtered, Capacity, !Greater, float, size_t> queue(
+    k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
 #endif
 
   using align_warp = Pow2<WarpSize>;
@@ -764,21 +773,14 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
   // How many full warps needed to compute the distance (without remainder)
   const int full_warps_along_dim = align_warp::roundDown(dim);
 
-  // Using shared memory for the query;
-  // This allows to save on global memory bandwidth when reading index and query
-  // data at the same time.
-  // This should be multiple of warpSize = 32
-  constexpr uint32_t queryShmemSize = 2048;
-  __shared__ T queryShared[queryShmemSize];
-
-  int shLoadDim = (dim < queryShmemSize) ? dim : queryShmemSize;
+  int shLoadDim = (dim < query_smem_elems) ? dim : query_smem_elems;
 
   // load the query data from global to shared memory
   for (int loadDim = threadIdx.x; loadDim * Veclen < shLoadDim; loadDim += blockDim.x) {
-    queryLoadToShmem<T, Veclen>(query, queryShared, loadDim);
+    queryLoadToShmem<T, Veclen>(query, query_shared, loadDim);
   }
   __syncthreads();
-  shLoadDim = (dim > queryShmemSize) ? shLoadDim : full_warps_along_dim;
+  shLoadDim = (dim > query_smem_elems) ? shLoadDim : full_warps_along_dim;
 
   // Every CUDA block scans one cluster at a time.
   for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
@@ -827,12 +829,12 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
             obj(dist, compute_dist);
 #pragma unroll
           for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
-            obj.runLoadShmemCompute(data, queryShared, laneId, dBase, i);
+            obj.runLoadShmemCompute(data, query_shared, laneId, dBase, i);
           }  // end for i < WarpSize / kUnroll
         }    // end for dBase < full_warps_along_dim
       }
 
-      if (dim > queryShmemSize) {
+      if (dim > query_smem_elems) {
         constexpr int kUnroll = WarpSize / Veclen;
         ;
         loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
@@ -850,7 +852,7 @@ __global__ void interleaved_scan_kernel(Lambda compute_dist,
                d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
             loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
               obj(dist, compute_dist);
-            obj.runLoadShmemCompute(data, queryShared, laneId, full_warps_along_dim + d, 0);
+            obj.runLoadShmemCompute(data, query_shared, laneId, full_warps_along_dim + d, 0);
           }  // end for d < dim - full_warps_along_dim
         }
       }
@@ -913,10 +915,12 @@ void launch_kernel(Lambda lambda,
                    rmm::cuda_stream_view stream)
 {
   constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
-#ifdef USE_FAISS
-  int smem_size = 0;
-#else
-  int smem_size = raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
+  int max_query_smem     = 16384;
+  int query_smem_elems =
+    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(dim));
+  int smem_size = query_smem_elems * sizeof(T);
+#ifndef USE_FAISS
+  smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
     utils::kNumWarps, k);
 #endif
 
@@ -933,12 +937,15 @@ void launch_kernel(Lambda lambda,
     dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
     dim3 block_dim(utils::kThreadPerBlock);
     RAFT_LOG_TRACE(
-      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), nprobe = %d",
+      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), nprobe = %d, "
+      "smem_size = %d",
       grid_dim.x,
       grid_dim.y,
       block_dim.x,
-      nprobe);
+      nprobe,
+      smem_size);
     kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
+                                                        query_smem_elems,
                                                         queries,
                                                         coarse_index,
                                                         list_index,

From ea8b1c43b3cf7e353dc8142a208045848e338e39 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 17:24:33 +0200
Subject: [PATCH 047/118] readability quickfix

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 592e8f2232..1993c519b4 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -520,7 +520,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   ninterleave_ = 0;
   for (uint32_t i = 0; i < nlist_; i++) {
     list_prefix_interleaved_host_[i] = ninterleave_;
-    ninterleave_ += ((list_lengths_host_[i] - 1) / WarpSize + 1) * WarpSize;
+    ninterleave_ += Pow2<WarpSize>::roundUp(list_lengths_host_[i]);
   }
 
   list_data_host_.assign(ninterleave_ * dim_, 0);

From d8a034a450081ceb07d04403f9e3a2bf4d79e5b5 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 31 May 2022 19:02:36 +0200
Subject: [PATCH 048/118] Correct the smem size for the warpsort and add launch
 bounds

---
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 30 ++++++++++---------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 1e8395ded7..908828f300 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -720,19 +720,20 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
  * query_smem_elems must be multiple of WarpSize * Veclen
  */
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
-__global__ void interleaved_scan_kernel(Lambda compute_dist,
-                                        const uint32_t query_smem_elems,
-                                        const T* queries,
-                                        const uint32_t* coarse_index,
-                                        const uint32_t* list_index,
-                                        const T* list_data,
-                                        const uint32_t* list_lengths,
-                                        const uint32_t* list_prefix_interleave,
-                                        const uint32_t nprobe,
-                                        const uint32_t k,
-                                        const uint32_t dim,
-                                        size_t* neighbors,
-                                        float* distances)
+__global__ void __launch_bounds__(utils::kThreadPerBlock)
+  interleaved_scan_kernel(Lambda compute_dist,
+                          const uint32_t query_smem_elems,
+                          const T* queries,
+                          const uint32_t* coarse_index,
+                          const uint32_t* list_index,
+                          const T* list_data,
+                          const uint32_t* list_lengths,
+                          const uint32_t* list_prefix_interleave,
+                          const uint32_t nprobe,
+                          const uint32_t k,
+                          const uint32_t dim,
+                          size_t* neighbors,
+                          float* distances)
 {
   extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
   // Using shared memory for the (part of the) query;
@@ -920,8 +921,9 @@ void launch_kernel(Lambda lambda,
     std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(dim));
   int smem_size = query_smem_elems * sizeof(T);
 #ifndef USE_FAISS
+  constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
   smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
-    utils::kNumWarps, k);
+    utils::kThreadPerBlock / kSubwarpSize, k);
 #endif
 
   // power-of-two less than cuda limit (for better addr alignment)

From d97d24884b21837cad22b2a8892d3e6e876a11e6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 1 Jun 2022 09:09:17 +0200
Subject: [PATCH 049/118] Add couple checks against floating point exceptions

---
 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 76fe7270fc..ae19db93f8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -84,6 +84,7 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
 //
 uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset)
 {
+  numCenters     = max(1, numCenters);
   uint32_t chunk = (1 << 20);
   if (chunk > (1 << 28) / numCenters) {
     chunk = (1 << 28) / numCenters;
@@ -282,7 +283,7 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                   const uint32_t* clusterSize,  // [numCenters]
                                   float threshold)
 {
-  // cudaDeviceSynchronize();
+  if (numCenters == 0) { return false; }
   bool adjusted                = false;
   static uint32_t i            = 0;
   static uint32_t iPrimes      = 0;

From 2e64037da831e08f09d73825c94cf31046d51d1f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 08:55:10 +0200
Subject: [PATCH 050/118] Don't run kmeans on empty dataset

---
 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index ae19db93f8..2601c662b2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -179,6 +179,12 @@ void _cuann_kmeans_predict(const handle_t& handle,
                            uint32_t* clusterSize = NULL,  // [numCenters,]
                            bool updateCenter     = true)
 {
+  if (numDataset == 0) {
+    RAFT_LOG_WARN("cuann_kmeans_predict: empty dataset (numDataset = %d, numCenters = %d)",
+                  numDataset,
+                  numCenters);
+    return;
+  }
   rmm::cuda_stream_view stream = handle.get_stream();
   if (!isCenterSet) {
     // If centers are not set, the labels will be determined randomly.

From 9ed50acc228ce8c5e5155cde0ad3cf031544061c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 11:07:32 +0200
Subject: [PATCH 051/118] Order all ops by a cuda stream

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  52 ++++---
 .../knn/detail/ann_kmeans_balanced.cuh        |  83 +++++++----
 .../raft/spatial/knn/detail/ann_utils.cuh     | 132 +++++++++---------
 3 files changed, 148 insertions(+), 119 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 1993c519b4..ce6aab1bec 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -273,7 +273,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                           (iter != 0),
                           predictWorkspace.data(),
                           mesoClusterCentersTemp.data(),
-                          mesoClusterSize);
+                          mesoClusterSize,
+                          stream_);
 
     if (iter < 2 * (numIterations - 2)) {
       if (_cuann_kmeans_adjust_centers(mesoClusterCenters.data(),
@@ -284,13 +285,14 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                                        mesoClusterLabels.data(),
                                        metric_type_,
                                        mesoClusterSize,
-                                       (float)1.0 / 4)) {
+                                       (float)1.0 / 4,
+                                       stream_)) {
         iter -= 1;
       }  // end if _cuann_kmeans_adjust_centers
     }    // end if iter < 2 * (numIterations - 2)
   }      // end for (int iter = 0; iter < 2 * numIterations; iter += 2)
 
-  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  handle_.sync_stream(stream_);
 
   std::vector<uint32_t> numFineClusters(numMesoClusters);
   std::vector<uint32_t> csumFineClusters(numMesoClusters + 1);
@@ -368,8 +370,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                                     dimDataset,
                                     subTrainset,
                                     dimDataset,
-                                    ivfflat_config<T>::kDivisor);
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+                                    ivfflat_config<T>::kDivisor,
+                                    stream_);
 
     for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
       RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (numClusters: %u): %.1f / %u",
@@ -389,7 +391,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                             (iter != 0),
                             predictWorkspace.data(),
                             clusterCentersMP.data(),
-                            clusterSizeMP.data());
+                            clusterSizeMP.data(),
+                            stream_);
 
       if (iter < 2 * (numIterations - 2)) {
         if (_cuann_kmeans_adjust_centers(clusterCentersEach.data(),
@@ -400,16 +403,17 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                                          labelsMP.data(),
                                          metric_type_,
                                          clusterSizeMP.data(),
-                                         (float)1.0 / 4)) {
+                                         (float)1.0 / 4,
+                                         stream_)) {
           iter -= 1;
         }
       }
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
     }
-    RAFT_CUDA_TRY(cudaMemcpy(clusterCenters + (dimDataset * csumFineClusters[i]),
-                             clusterCentersEach.data(),
-                             sizeof(float) * numFineClusters[i] * dimDataset,
-                             cudaMemcpyDefault));
+    copy(clusterCenters + (dimDataset * csumFineClusters[i]),
+         clusterCentersEach.data(),
+         numFineClusters[i] * dimDataset,
+         stream_);
+    handle_.sync_stream(stream_);
     numClustersDone += numFineClusters[i];
   }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
   assert(numClustersDone == numClusters);
@@ -435,7 +439,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                           predictWorkspace.data(),
                           clusterCentersMP.data(),
                           clusterSizeMP.data(),
-                          true);
+                          true,
+                          stream_);
   }  // end for (int iter = 0; iter < 2; iter++)
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
@@ -455,7 +460,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                         predictWorkspace.data(),
                         clusterCentersMP.data(),
                         clusterSizeMP.data(),
-                        true);
+                        true,
+                        stream_);
 
   _cuann_kmeans_predict(handle_,
                         clusterCenters,
@@ -469,7 +475,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                         predictWorkspace.data(),
                         clusterCentersMP.data(),
                         clusterSizeMP.data(),
-                        false);
+                        false,
+                        stream_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
@@ -503,12 +510,12 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   centriod_norm_dev_.resize(nlist_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data());
+    utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
 
   // Step 4: Record the number of elements in each clusters
-  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  handle_.sync_stream(stream_);
 
   list_prefix_interleaved_host_.resize(nlist_);
   list_lengths_host_.assign(nlist_, 0);
@@ -544,7 +551,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   list_lengths_dev_.resize(nlist_, stream_);
   centriod_dev_.resize(nlist_ * dim_, stream_);
 
-  // Step 3: Read the list
+  // Step 5: Read the list
   copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
   copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
   copy(centriod_dev_.data(), centriod_managed_ptr, nlist_ * dim_, stream_);
@@ -663,8 +670,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                                  dim_,
                                  converted_queries_ptr,
                                  dim_,
-                                 stream_,
-                                 ivfflat_config<T>::kDivisor);
+                                 ivfflat_config<T>::kDivisor,
+                                 stream_);
   }
 
   float alpha = 1.0f;
@@ -673,12 +680,13 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
-    utils::_cuann_sqsum(batch_size, dim_, converted_queries_ptr, query_norm_dev.data());
+    utils::_cuann_sqsum(batch_size, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
     utils::_cuann_outer_add(query_norm_dev.data(),
                             batch_size,
                             centriod_norm_dev_.data(),
                             nlist_,
-                            distance_buffer_dev.data());
+                            distance_buffer_dev.data(),
+                            stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
     RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 2601c662b2..433a5f28aa 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -40,7 +40,8 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
                                 uint32_t numDataset,
                                 uint32_t* labels,  // [numDataset]
                                 raft::distance::DistanceType metric,
-                                float* workspace)
+                                float* workspace,
+                                rmm::cuda_stream_view stream)
 {
   const uint32_t dimDataset = dimCenters;
   float* sqsumCenters;  // [numCenters]
@@ -57,9 +58,9 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
     alpha = -1.0;
     beta  = 0.0;
   } else {
-    utils::_cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters);
-    utils::_cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset);
-    utils::_cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances);
+    utils::_cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters, stream);
+    utils::_cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset, stream);
+    utils::_cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances, stream);
     alpha = -2.0;
     beta  = 1.0;
   }
@@ -77,8 +78,8 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
                &beta,
                distances,
                numCenters,
-               handle.get_stream());
-  utils::_cuann_argmin(numDataset, numCenters, distances, labels);
+               stream);
+  utils::_cuann_argmin(numDataset, numCenters, distances, labels, stream);
 }
 
 //
@@ -132,29 +133,29 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t* labels,  // [numDataset]
                                   raft::distance::DistanceType metric,
                                   uint32_t* clusterSize,  // [numCenters]
-                                  float* accumulatedCenters = NULL)
+                                  float* accumulatedCenters    = NULL,
+                                  rmm::cuda_stream_view stream = rmm::cuda_stream_default)
 {
   if (accumulatedCenters == NULL) {
     // accumulate
-    utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters);
-    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
+    utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters, stream);
+    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
     float divisor;
     if constexpr (std::is_same_v<T, float>) { divisor = 1.0; }
     if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
     if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
     utils::_cuann_accumulate_with_label<T>(
-      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, divisor);
+      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, divisor, stream);
   } else {
-    copy(centers, accumulatedCenters, numCenters * dimCenters, rmm::cuda_stream_default);
-    interruptible::synchronize(rmm::cuda_stream_default);
+    copy(centers, accumulatedCenters, numCenters * dimCenters, stream);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
     // normalize
-    utils::_cuann_normalize(numCenters, dimCenters, centers, clusterSize);
+    utils::_cuann_normalize(numCenters, dimCenters, centers, clusterSize, stream);
   } else {
     // average
-    utils::_cuann_divide(numCenters, dimCenters, centers, clusterSize);
+    utils::_cuann_divide(numCenters, dimCenters, centers, clusterSize, stream);
   }
 }
 
@@ -173,11 +174,12 @@ void _cuann_kmeans_predict(const handle_t& handle,
                            uint32_t numDataset,
                            uint32_t* labels,  // [numDataset]
                            raft::distance::DistanceType metric,
-                           bool isCenterSet      = true,
-                           void* _workspace      = NULL,
-                           float* tempCenters    = NULL,  // [numCenters, dimCenters]
-                           uint32_t* clusterSize = NULL,  // [numCenters,]
-                           bool updateCenter     = true)
+                           bool isCenterSet             = true,
+                           void* _workspace             = NULL,
+                           float* tempCenters           = NULL,  // [numCenters, dimCenters]
+                           uint32_t* clusterSize        = NULL,  // [numCenters,]
+                           bool updateCenter            = true,
+                           rmm::cuda_stream_view stream = rmm::cuda_stream_default)
 {
   if (numDataset == 0) {
     RAFT_LOG_WARN("cuann_kmeans_predict: empty dataset (numDataset = %d, numCenters = %d)",
@@ -185,7 +187,6 @@ void _cuann_kmeans_predict(const handle_t& handle,
                   numCenters);
     return;
   }
-  rmm::cuda_stream_view stream = handle.get_stream();
   if (!isCenterSet) {
     // If centers are not set, the labels will be determined randomly.
     linalg::writeOnlyUnaryOp(
@@ -195,8 +196,16 @@ void _cuann_kmeans_predict(const handle_t& handle,
       stream);
     if (tempCenters != NULL && clusterSize != NULL) {
       // update centers
-      _cuann_kmeans_update_centers(
-        centers, numCenters, dimCenters, dataset, numDataset, labels, metric, clusterSize);
+      _cuann_kmeans_update_centers(centers,
+                                   numCenters,
+                                   dimCenters,
+                                   dataset,
+                                   numDataset,
+                                   labels,
+                                   metric,
+                                   clusterSize,
+                                   nullptr,
+                                   stream);
     }
     return;
   }
@@ -220,8 +229,8 @@ void _cuann_kmeans_predict(const handle_t& handle,
     (float*)((uint8_t*)bufDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
 
   if (tempCenters != NULL && clusterSize != NULL) {
-    utils::_cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters);
-    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters);
+    utils::_cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters, stream);
+    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
   }
 
   for (uint64_t is = 0; is < numDataset; is += chunk) {
@@ -239,7 +248,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
       if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
       if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
       utils::_cuann_copy<T, float>(
-        nDataset, dimCenters, bufDataset, dimCenters, curDataset, dimCenters, divisor);
+        nDataset, dimCenters, bufDataset, dimCenters, curDataset, dimCenters, divisor, stream);
     }
 
     // predict
@@ -251,12 +260,20 @@ void _cuann_kmeans_predict(const handle_t& handle,
                                nDataset,
                                labels + is,
                                metric,
-                               workspace_core);
+                               workspace_core,
+                               stream);
 
     if ((tempCenters != NULL) && (clusterSize != NULL)) {
       // accumulate
-      utils::_cuann_accumulate_with_label<float>(
-        numCenters, dimCenters, tempCenters, clusterSize, nDataset, curDataset, labels + is);
+      utils::_cuann_accumulate_with_label<float>(numCenters,
+                                                 dimCenters,
+                                                 tempCenters,
+                                                 clusterSize,
+                                                 nDataset,
+                                                 curDataset,
+                                                 labels + is,
+                                                 1.0,
+                                                 stream);
     }
   }
 
@@ -269,14 +286,15 @@ void _cuann_kmeans_predict(const handle_t& handle,
                                  labels,
                                  metric,
                                  clusterSize,
-                                 tempCenters);
+                                 tempCenters,
+                                 stream);
   }
 }
 
 /**
  * @brief adjust centers which have small number of entries
  *
- * NB: all pointers are used on the CPU side.
+ * NB: all pointers are used on the host side.
  */
 template <typename T>
 bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
@@ -287,8 +305,10 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                   const uint32_t* labels,  // [numDataset]
                                   raft::distance::DistanceType metric,
                                   const uint32_t* clusterSize,  // [numCenters]
-                                  float threshold)
+                                  float threshold,
+                                  rmm::cuda_stream_view stream)
 {
+  stream.synchronize();
   if (numCenters == 0) { return false; }
   bool adjusted                = false;
   static uint32_t i            = 0;
@@ -337,6 +357,7 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
     adjusted = true;
     count += 1;
   }
+  stream.synchronize();
   return adjusted;
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 740798ee08..8860412b8b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -23,11 +23,9 @@
 #include <raft/distance/distance.hpp>
 #include <raft/distance/distance_type.hpp>
 
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-namespace utils {
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::spatial ::knn::detail::utils {
 
 constexpr int kThreadPerBlock = 128;
 constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
@@ -45,18 +43,19 @@ size_t _cuann_aligned(size_t size, size_t unit = 128)
  * @param[in] value
  * @param[in] count
  */
-void _cuann_memset(void* ptr, int value, size_t count)
+void _cuann_memset(void* ptr, int value, size_t count, rmm::cuda_stream_view stream)
 {
   cudaPointerAttributes attr;
   cudaPointerGetAttributes(&attr, ptr);
   if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
-    RAFT_CUDA_TRY(cudaMemset(ptr, value, count));
+    RAFT_CUDA_TRY(cudaMemsetAsync(ptr, value, count, stream));
   } else {
+    stream.synchronize();
     memset(ptr, value, count);
+    stream.synchronize();
   }
 }
 
-// argmin along column
 __global__ void kern_argmin(uint32_t nRows,
                             uint32_t nCols,
                             const float* a,  // [nRows, nCols]
@@ -94,22 +93,25 @@ __global__ void kern_argmin(uint32_t nRows,
   if (threadIdx.x == 0) { out[iRow] = smCol[0]; }
 }
 
-// argmin along column
+/**
+ * argmin along column
+ *
+ * NB: device-only
+ */
 void _cuann_argmin(uint32_t nRows,
                    uint32_t nCols,
                    const float* a,  // [nRows, nCols]
-                   uint32_t* out    // [nRows]
-)
+                   uint32_t* out,   // [nRows]
+                   rmm::cuda_stream_view stream)
 {
   uint32_t nThreads = 1024;
   while (nThreads > nCols) {
     nThreads /= 2;
   }
   nThreads = max(nThreads, 128);
-  kern_argmin<<<nRows, nThreads>>>(nRows, nCols, a, out);
+  kern_argmin<<<nRows, nThreads, 0, stream>>>(nRows, nCols, a, out);
 }
 
-// square sum along column
 __global__ void kern_sqsum(uint32_t nRows,
                            uint32_t nCols,
                            const float* a,  // [nRows, nCols]
@@ -132,20 +134,22 @@ __global__ void kern_sqsum(uint32_t nRows,
   if (threadIdx.x == 0) { out[iRow] = sqsum; }
 }
 
-// square sum along column
+/**
+ * square sum along column
+ *
+ * NB: device-only
+ */
 void _cuann_sqsum(uint32_t nRows,
                   uint32_t nCols,
                   const float* a,  // [numDataset, dimDataset]
-                  float* out       // [numDataset,]
-)
+                  float* out,      // [numDataset,]
+                  rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
   dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
-  kern_sqsum<<<blocks, threads>>>(nRows, nCols, a, out);
+  kern_sqsum<<<blocks, threads, 0, stream>>>(nRows, nCols, a, out);
 }
 
-// copy
-
 template <typename S, typename D>
 __global__ void kern_copy(uint32_t nRows,
                           uint32_t nCols,
@@ -161,7 +165,6 @@ __global__ void kern_copy(uint32_t nRows,
   dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)];
 }
 
-// copy
 template <typename S, typename D>
 __global__ void kern_copy(uint32_t nRows,
                           uint32_t nCols,
@@ -178,6 +181,10 @@ __global__ void kern_copy(uint32_t nRows,
   dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor;
 }
 
+/**
+ *
+ * NB: device-only
+ */
 template <typename S, typename D>
 void _cuann_copy(uint32_t nRows,
                  uint32_t nCols,
@@ -185,29 +192,14 @@ void _cuann_copy(uint32_t nRows,
                  uint32_t ldSrc,
                  D* dst,  // [nRows, ldDst]
                  uint32_t ldDst,
-                 D divisor)
-{
-  uint32_t nThreads = 128;
-  uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
-  kern_copy<S, D><<<nBlocks, nThreads>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
-}
-
-template <typename S, typename D>
-void _cuann_copy(uint32_t nRows,
-                 uint32_t nCols,
-                 const S* src,  // [nRows, ldSrc]
-                 uint32_t ldSrc,
-                 D* dst,  // [nRows, ldDst]
-                 uint32_t ldDst,
-                 cudaStream_t stream,
-                 D divisor)
+                 D divisor,
+                 rmm::cuda_stream_view stream)
 {
   uint32_t nThreads = 128;
   uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
   kern_copy<S, D><<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
 }
 
-// accumulate
 template <typename T>
 __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
                                            uint32_t nCols,
@@ -251,7 +243,8 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
                                   uint32_t nRowsInput,
                                   const T* input,         // [nRowsInput, nCols,]
                                   const uint32_t* label,  // [nRowsInput,]
-                                  float divisor = 1.0)
+                                  float divisor,
+                                  rmm::cuda_stream_view stream)
 {
   bool useGPU = 1;
   cudaPointerAttributes attr;
@@ -270,11 +263,11 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
     // GPU
     uint32_t nThreads = 128;
     uint64_t nBlocks  = ceildiv((uint64_t)nRowsInput * nCols, (uint64_t)nThreads);
-    kern_accumulate_with_label<T>
-      <<<nBlocks, nThreads>>>(nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor);
+    kern_accumulate_with_label<T><<<nBlocks, nThreads, 0, stream>>>(
+      nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor);
   } else {
     // CPU
-    cudaDeviceSynchronize();
+    stream.synchronize();
     for (uint64_t i = 0; i < nRowsInput; i++) {
       uint64_t l = label[i];
       count[l] += 1;
@@ -282,10 +275,10 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
         output[j + (nCols * l)] += input[j + (nCols * i)] / divisor;
       }
     }
+    stream.synchronize();
   }
 }
 
-// normalize
 __global__ void kern_normalize(uint32_t nRows,
                                uint32_t nCols,
                                float* a,                   // [nRows, nCols]
@@ -315,6 +308,8 @@ __global__ void kern_normalize(uint32_t nRows,
 /**
  * @brief Normalize
  *
+ * NB: device-only
+ *
  * @param[in] nRows
  * @param[in] nCols
  * @param[inout] a device pointer
@@ -322,13 +317,13 @@ __global__ void kern_normalize(uint32_t nRows,
  */
 void _cuann_normalize(uint32_t nRows,
                       uint32_t nCols,
-                      float* a,                   // [nRows, nCols]
-                      const uint32_t* numSamples  // [nRows,]
-)
+                      float* a,                    // [nRows, nCols]
+                      const uint32_t* numSamples,  // [nRows,]
+                      rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
   dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
-  kern_normalize<<<blocks, threads>>>(nRows, nCols, a, numSamples);
+  kern_normalize<<<blocks, threads, 0, stream>>>(nRows, nCols, a, numSamples);
 }
 
 // divide
@@ -348,6 +343,8 @@ __global__ void kern_divide(uint32_t nRows,
 /**
  * @brief Divide
  *
+ * NB: device-only
+ *
  * @param[in] nRows
  * @param[in] nCols
  * @param[inout] a device pointer
@@ -355,16 +352,14 @@ __global__ void kern_divide(uint32_t nRows,
  */
 void _cuann_divide(uint32_t nRows,
                    uint32_t nCols,
-                   float* a,                   // [nRows, nCols]
-                   const uint32_t* numSamples  // [nRows,]
-)
+                   float* a,                    // [nRows, nCols]
+                   const uint32_t* numSamples,  // [nRows,]
+                   rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);
   dim3 blocks(ceildiv<uint64_t>((uint64_t)nRows * (uint64_t)nCols, threads.x), 1, 1);
-  kern_divide<<<blocks, threads>>>(nRows, nCols, a, numSamples);
+  kern_divide<<<blocks, threads, 0, stream>>>(nRows, nCols, a, numSamples);
 }
-
-// outer add
 __global__ void kern_outer_add(const float* a,
                                uint32_t numA,
                                const float* b,
@@ -381,17 +376,21 @@ __global__ void kern_outer_add(const float* a,
   c[gid]     = valA + valB;
 }
 
-// outer add
+/**
+ * outer add
+ *
+ * NB: device-only
+ */
 void _cuann_outer_add(const float* a,
                       uint32_t numA,
                       const float* b,
                       uint32_t numB,
-                      float* c  // [numA, numB]
-)
+                      float* c,  // [numA, numB]
+                      rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);
   dim3 blocks(ceildiv<uint64_t>((uint64_t)numA * (uint64_t)numB, threads.x), 1, 1);
-  kern_outer_add<<<blocks, threads>>>(a, numA, b, numB, c);
+  kern_outer_add<<<blocks, threads, 0, stream>>>(a, numA, b, numB, c);
 }
 
 // copy with row list
@@ -411,8 +410,6 @@ __global__ void kern_copy_with_list(uint32_t nRows,
   uint64_t iaRow             = rowList[iRow];
   dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)];
 }
-
-// copy with row list
 template <typename T>
 __global__ void kern_copy_with_list(uint32_t nRows,
                                     uint32_t nCols,
@@ -431,7 +428,11 @@ __global__ void kern_copy_with_list(uint32_t nRows,
   dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
 }
 
-// copy with row list
+/**
+ * copy with row list
+ *
+ * NB: host or device
+ */
 template <typename T>
 void _cuann_copy_with_list(uint32_t nRows,
                            uint32_t nCols,
@@ -440,26 +441,25 @@ void _cuann_copy_with_list(uint32_t nRows,
                            uint32_t ldSrc,
                            float* dst,  // [nRows, ldDst]
                            uint32_t ldDst,
-                           float divisor = 1.0)
+                           float divisor,
+                           rmm::cuda_stream_view stream)
 {
   cudaPointerAttributes attr;
   cudaPointerGetAttributes(&attr, src);
   if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) {
+    stream.synchronize();
     for (uint64_t iRow = 0; iRow < nRows; iRow++) {
       uint64_t iaRow = rowList[iRow];
       for (uint64_t iCol = 0; iCol < nCols; iCol++) {
         dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
       }
     }
+    stream.synchronize();
   } else {
     uint32_t nThreads = 128;
     uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
     kern_copy_with_list<T>
-      <<<nBlocks, nThreads>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor);
+      <<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor);
   }
 }
-}  // namespace utils
-}  // namespace detail
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn::detail::utils

From 1f9352ca7d83292f24282d60f28a057ba1e3e906 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 11:15:55 +0200
Subject: [PATCH 052/118] Update comments

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index ce6aab1bec..910b02e2c0 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -503,10 +503,10 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   rmm::device_uvector<uint32_t> datasetLabels_buf(nrow_, stream_, &managed_memory);  // [numDataset]
   auto datasetLabels = datasetLabels_buf.data();
 
-  // Step 3: Predict labels of the whole dataset
+  // Predict labels of the whole dataset
   cuivflBuildOptimizedKmeans(centriod_managed_ptr, dataset, trainset, datasetLabels, nrow, ntrain);
 
-  // Step 3.2: Calculate the L2 related result
+  // Calculate the L2 related result
   centriod_norm_dev_.resize(nlist_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
@@ -514,7 +514,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
 
-  // Step 4: Record the number of elements in each clusters
+  // Record the number of elements in each clusters
   handle_.sync_stream(stream_);
 
   list_prefix_interleaved_host_.resize(nlist_);
@@ -551,7 +551,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   list_lengths_dev_.resize(nlist_, stream_);
   centriod_dev_.resize(nlist_ * dim_, stream_);
 
-  // Step 5: Read the list
+  // Read the list
   copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
   copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
   copy(centriod_dev_.data(), centriod_managed_ptr, nlist_ * dim_, stream_);
@@ -560,7 +560,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   copy(list_index_dev_.data(), list_index_host_.data(), ninterleave_, stream_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflBuildIndex
+}
 
 template <typename T>
 cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,

From c048af294b276d133ab1b454157a202e0ad022c6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 14:19:27 +0200
Subject: [PATCH 053/118] Suggest replacing _cuann_sqsum

---
 .../raft/spatial/knn/detail/ann_utils.cuh       | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 8860412b8b..9d79810772 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -135,9 +135,15 @@ __global__ void kern_sqsum(uint32_t nRows,
 }
 
 /**
- * square sum along column
+ * @brief Square sum along rows (row-major).
  *
  * NB: device-only
+ *
+ * @param nRows
+ * @param nCols
+ * @param[in] a device pointer to the row-major matrix [nRows, nCols]
+ * @param[out] out device pointer to the vector of dot-products [nRows]
+ * @param stream
  */
 void _cuann_sqsum(uint32_t nRows,
                   uint32_t nCols,
@@ -145,9 +151,16 @@ void _cuann_sqsum(uint32_t nRows,
                   float* out,      // [numDataset,]
                   rmm::cuda_stream_view stream)
 {
-  dim3 threads(32, 4, 1);  // DO NOT CHANGE
+  dim3 threads(32, 4, 1);
   dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
   kern_sqsum<<<blocks, threads, 0, stream>>>(nRows, nCols, a, out);
+  /**
+   * TODO: this can be replaced with the rowNorm helper as shown below.
+   * However, the rowNorm helper seems to incur a significant performance penalty
+   * (example case ann-search slowed down from 150ms to 186ms).
+   *
+   * raft::linalg::rowNorm(out, a, nCols, nRows, raft::linalg::L2Norm, true, stream);
+   */
 }
 
 template <typename S, typename D>

From 96f39a8de544ee6b195bd424fb9c413e675cb9b6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 15:59:10 +0200
Subject: [PATCH 054/118] wip: refactoting utils

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  72 +++------
 .../knn/detail/ann_kmeans_balanced.cuh        |  43 ++----
 .../raft/spatial/knn/detail/ann_utils.cuh     | 142 +++++++-----------
 3 files changed, 97 insertions(+), 160 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 910b02e2c0..00314b58dd 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -26,9 +26,10 @@
 #include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
+#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/unary_op.cuh>
 #include <raft/spatial/knn/ann_common.h>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -88,26 +89,6 @@ enum cuivflStatus_t : unsigned int {
   CUIVFL_STATUS_NOT_BUILD         = 12
 };
 
-template <typename T>
-struct ivfflat_config {
-};
-
-template <>
-struct ivfflat_config<float> {
-  using value_t                   = float;
-  static constexpr float kDivisor = 1.0;
-};
-template <>
-struct ivfflat_config<uint8_t> {
-  using value_t                   = uint32_t;
-  static constexpr float kDivisor = 256.0;
-};
-template <>
-struct ivfflat_config<int8_t> {
-  using value_t                   = int32_t;
-  static constexpr float kDivisor = 128.0;
-};
-
 template <typename T>
 class cuivflHandle {
  public:
@@ -370,7 +351,6 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                                     dimDataset,
                                     subTrainset,
                                     dimDataset,
-                                    ivfflat_config<T>::kDivisor,
                                     stream_);
 
     for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
@@ -493,7 +473,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   rmm::device_uvector<float> centriod_managed_buf(nlist_ * dim_, stream_, &managed_memory);
   auto centriod_managed_ptr = centriod_managed_buf.data();
 
-  if (this == NULL || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
+  if (this == nullptr || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
   if constexpr (!std::is_same_v<T, float> && !std::is_same_v<T, uint8_t> &&
                 !std::is_same_v<T, int8_t>) {
     return CUIVFL_STATUS_UNSUPPORTED_DTYPE;
@@ -568,23 +548,23 @@ cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
                                                      const uint32_t k)
 {
   // query the gridDimX size to store probes topK output
-  ivfflat_interleaved_scan<T, typename ivfflat_config<T>::value_t>(nullptr,
-                                                                   nullptr,
-                                                                   nullptr,
-                                                                   nullptr,
-                                                                   nullptr,
-                                                                   nullptr,
-                                                                   metric_type_,
-                                                                   nprobe,
-                                                                   k,
-                                                                   batch_size,
-                                                                   dim_,
-                                                                   nullptr,
-                                                                   nullptr,
-                                                                   stream_,
-                                                                   greater_,
-                                                                   veclen_,
-                                                                   grid_dim_x_);
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(nullptr,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  metric_type_,
+                                                                  nprobe,
+                                                                  k,
+                                                                  batch_size,
+                                                                  dim_,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  stream_,
+                                                                  greater_,
+                                                                  veclen_,
+                                                                  grid_dim_x_);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
@@ -664,14 +644,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if constexpr (std::is_same_v<T, float>) {
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
-    utils::_cuann_copy<T, float>(batch_size,
-                                 dim_,
-                                 queries,
-                                 dim_,
-                                 converted_queries_ptr,
-                                 dim_,
-                                 ivfflat_config<T>::kDivisor,
-                                 stream_);
+    linalg::unaryOp(
+      converted_queries_ptr, queries, batch_size * dim_, utils::mapping<T, float>{}, stream_);
   }
 
   float alpha = 1.0f;
@@ -743,7 +717,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
     indices_dev_ptr   = neighbors;
   }
 
-  ivfflat_interleaved_scan<T, typename ivfflat_config<T>::value_t>(
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(
     queries,
     coarse_indices_dev.data(),
     list_index_dev_.data(),
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 433a5f28aa..17f9f620a2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -23,8 +23,9 @@
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
 #include <raft/distance/distance_type.hpp>
-#include <raft/interruptible.hpp>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/matrix.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -133,19 +134,15 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
                                   uint32_t* labels,  // [numDataset]
                                   raft::distance::DistanceType metric,
                                   uint32_t* clusterSize,  // [numCenters]
-                                  float* accumulatedCenters    = NULL,
+                                  float* accumulatedCenters    = nullptr,
                                   rmm::cuda_stream_view stream = rmm::cuda_stream_default)
 {
-  if (accumulatedCenters == NULL) {
+  if (accumulatedCenters == nullptr) {
     // accumulate
     utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters, stream);
     utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
-    float divisor;
-    if constexpr (std::is_same_v<T, float>) { divisor = 1.0; }
-    if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
-    if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
     utils::_cuann_accumulate_with_label<T>(
-      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, divisor, stream);
+      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, stream);
   } else {
     copy(centers, accumulatedCenters, numCenters * dimCenters, stream);
   }
@@ -175,9 +172,9 @@ void _cuann_kmeans_predict(const handle_t& handle,
                            uint32_t* labels,  // [numDataset]
                            raft::distance::DistanceType metric,
                            bool isCenterSet             = true,
-                           void* _workspace             = NULL,
-                           float* tempCenters           = NULL,  // [numCenters, dimCenters]
-                           uint32_t* clusterSize        = NULL,  // [numCenters,]
+                           void* _workspace             = nullptr,
+                           float* tempCenters           = nullptr,  // [numCenters, dimCenters]
+                           uint32_t* clusterSize        = nullptr,  // [numCenters,]
                            bool updateCenter            = true,
                            rmm::cuda_stream_view stream = rmm::cuda_stream_default)
 {
@@ -194,7 +191,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
       numDataset,
       [numCenters] __device__(uint32_t * out, uint32_t i) { *out = i % numCenters; },
       stream);
-    if (tempCenters != NULL && clusterSize != NULL) {
+    if (tempCenters != nullptr && clusterSize != nullptr) {
       // update centers
       _cuann_kmeans_update_centers(centers,
                                    numCenters,
@@ -214,7 +211,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
   void* workspace = _workspace;
   rmm::device_buffer sub_workspace(0, stream);
 
-  if (_workspace == NULL) {
+  if (_workspace == nullptr) {
     sub_workspace.resize(_cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset),
                          stream);
     workspace = sub_workspace.data();
@@ -228,7 +225,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
   workspace_core =
     (float*)((uint8_t*)bufDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
 
-  if (tempCenters != NULL && clusterSize != NULL) {
+  if (tempCenters != nullptr && clusterSize != nullptr) {
     utils::_cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters, stream);
     utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
   }
@@ -244,11 +241,8 @@ void _cuann_kmeans_predict(const handle_t& handle,
       // No need to copy floats
       curDataset = bufDataset;
     } else {
-      float divisor;
-      if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
-      if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
-      utils::_cuann_copy<T, float>(
-        nDataset, dimCenters, bufDataset, dimCenters, curDataset, dimCenters, divisor, stream);
+      linalg::unaryOp(
+        curDataset, bufDataset, nDataset * dimCenters, utils::mapping<T, float>{}, stream);
     }
 
     // predict
@@ -263,7 +257,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
                                workspace_core,
                                stream);
 
-    if ((tempCenters != NULL) && (clusterSize != NULL)) {
+    if ((tempCenters != nullptr) && (clusterSize != nullptr)) {
       // accumulate
       utils::_cuann_accumulate_with_label<float>(numCenters,
                                                  dimCenters,
@@ -272,12 +266,11 @@ void _cuann_kmeans_predict(const handle_t& handle,
                                                  nDataset,
                                                  curDataset,
                                                  labels + is,
-                                                 1.0,
                                                  stream);
     }
   }
 
-  if ((tempCenters != NULL) && (clusterSize != NULL) && updateCenter) {
+  if ((tempCenters != nullptr) && (clusterSize != nullptr) && updateCenter) {
     _cuann_kmeans_update_centers(centers,
                                  numCenters,
                                  dimCenters,
@@ -320,10 +313,6 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
                                 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
   uint32_t average             = numDataset / numCenters;
   uint32_t ofst;
-  float divisor;
-  if constexpr (std::is_same_v<T, float>) { divisor = 1.0; }
-  if constexpr (std::is_same_v<T, uint8_t>) { divisor = 256.0; }
-  if constexpr (std::is_same_v<T, int8_t>) { divisor = 128.0; }
 
   do {
     iPrimes = (iPrimes + 1) % numPrimes;
@@ -343,7 +332,7 @@ bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
       constexpr float kWd = 1.0;
       float val           = 0;
       val += kWc * centers[j + ((uint64_t)dimCenters * li)];
-      val += kWd * dataset[j + ((uint64_t)dimCenters * i)] / divisor;
+      val += kWd * dataset[j + ((uint64_t)dimCenters * i)] / utils::config<T>::kDivisor;
       val /= kWc + kWd;
       sqsum += val * val;
       centers[j + ((uint64_t)dimCenters * l)] = val;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 9d79810772..8dc75067b3 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -56,6 +56,52 @@ void _cuann_memset(void* ptr, int value, size_t count, rmm::cuda_stream_view str
   }
 }
 
+template <typename T>
+struct config {
+};
+
+template <>
+struct config<float> {
+  using value_t                   = float;
+  static constexpr float kDivisor = 1.0;
+};
+template <>
+struct config<uint8_t> {
+  using value_t                   = uint32_t;
+  static constexpr float kDivisor = 256.0;
+};
+template <>
+struct config<int8_t> {
+  using value_t                   = int32_t;
+  static constexpr float kDivisor = 128.0;
+};
+
+template <typename T, typename S>
+struct mapping {
+  HDI auto operator()(T x) -> S;
+};
+
+template <typename T>
+struct mapping<T, T> {
+  HDI auto operator()(T x) -> T { return x; }
+};
+
+template <typename T>
+struct mapping<T, float> {
+  HDI auto operator()(T x) -> float { return float(x) * kMult; }
+
+ private:
+  static constexpr float kMult = 1 / config<T>::kDivisor;
+};
+
+template <typename S>
+struct mapping<float, S> {
+  HDI auto operator()(float x) -> S { return S(x * kMult); }
+
+ private:
+  static constexpr float kMult = config<S>::kDivisor;
+};
+
 __global__ void kern_argmin(uint32_t nRows,
                             uint32_t nCols,
                             const float* a,  // [nRows, nCols]
@@ -163,65 +209,15 @@ void _cuann_sqsum(uint32_t nRows,
    */
 }
 
-template <typename S, typename D>
-__global__ void kern_copy(uint32_t nRows,
-                          uint32_t nCols,
-                          const S* src,  // [nRows, ldSrc]
-                          uint32_t ldSrc,
-                          D* dst,  // [nRows, ldDst]
-                          uint32_t ldDst)
-{
-  uint32_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint32_t iCol = gid % nCols;
-  uint32_t iRow = gid / nCols;
-  if (iRow >= nRows) return;
-  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)];
-}
-
-template <typename S, typename D>
-__global__ void kern_copy(uint32_t nRows,
-                          uint32_t nCols,
-                          const S* src,  // [nRows, ldSrc]
-                          uint32_t ldSrc,
-                          D* dst,  // [nRows, ldDst]
-                          uint32_t ldDst,
-                          D divisor)
-{
-  uint32_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint32_t iCol = gid % nCols;
-  uint32_t iRow = gid / nCols;
-  if (iRow >= nRows) return;
-  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iRow)] / divisor;
-}
-
-/**
- *
- * NB: device-only
- */
-template <typename S, typename D>
-void _cuann_copy(uint32_t nRows,
-                 uint32_t nCols,
-                 const S* src,  // [nRows, ldSrc]
-                 uint32_t ldSrc,
-                 D* dst,  // [nRows, ldDst]
-                 uint32_t ldDst,
-                 D divisor,
-                 rmm::cuda_stream_view stream)
-{
-  uint32_t nThreads = 128;
-  uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
-  kern_copy<S, D><<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, ldSrc, dst, ldDst, divisor);
-}
-
 template <typename T>
 __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
                                            uint32_t nCols,
                                            float* output,    // [nRowsOutput, nCols,]
                                            uint32_t* count,  // [nRowsOutput,]
                                            uint32_t nRowsInput,
-                                           const T* input,         // [nRowsInput, nCols,]
-                                           const uint32_t* label,  // [nRowsInput,]
-                                           float divisor)
+                                           const T* input,        // [nRowsInput, nCols,]
+                                           const uint32_t* label  // [nRowsInput,]
+)
 {
   uint64_t gid       = threadIdx.x + (blockDim.x * blockIdx.x);
   uint64_t iCol      = gid % nCols;
@@ -229,7 +225,7 @@ __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
   if (iRowInput >= nRowsInput) return;
   uint64_t iRowOutput = label[iRowInput];
   if (iCol == 0) { atomicAdd(&(count[iRowOutput]), 1); }
-  atomicAdd(&(output[iCol + (nCols * iRowOutput)]), input[gid] / divisor);
+  atomicAdd(&(output[iCol + (nCols * iRowOutput)]), float(input[gid]) / config<T>::kDivisor);
 }
 
 /**
@@ -246,7 +242,6 @@ __global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
  * @param nRowsInput
  * @param input device/host pointer
  * @param label device/host pointer
- * @param divisor
  */
 template <typename T>
 void _cuann_accumulate_with_label(uint32_t nRowsOutput,
@@ -256,7 +251,6 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
                                   uint32_t nRowsInput,
                                   const T* input,         // [nRowsInput, nCols,]
                                   const uint32_t* label,  // [nRowsInput,]
-                                  float divisor,
                                   rmm::cuda_stream_view stream)
 {
   bool useGPU = 1;
@@ -269,15 +263,13 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
   if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
   cudaPointerGetAttributes(&attr, label);
   if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
-  // _cuann_memset(output, 0, sizeof(float) * nRowsOutput * nCols);
-  // _cuann_memset(count, 0, sizeof(uint32_t) * nRowsOutput);
 
   if (useGPU) {
     // GPU
     uint32_t nThreads = 128;
     uint64_t nBlocks  = ceildiv((uint64_t)nRowsInput * nCols, (uint64_t)nThreads);
     kern_accumulate_with_label<T><<<nBlocks, nThreads, 0, stream>>>(
-      nRowsOutput, nCols, output, count, nRowsInput, input, label, divisor);
+      nRowsOutput, nCols, output, count, nRowsInput, input, label);
   } else {
     // CPU
     stream.synchronize();
@@ -285,7 +277,7 @@ void _cuann_accumulate_with_label(uint32_t nRowsOutput,
       uint64_t l = label[i];
       count[l] += 1;
       for (uint64_t j = 0; j < nCols; j++) {
-        output[j + (nCols * l)] += input[j + (nCols * i)] / divisor;
+        output[j + (nCols * l)] += float(input[j + (nCols * i)]) / config<T>::kDivisor;
       }
     }
     stream.synchronize();
@@ -300,7 +292,7 @@ __global__ void kern_normalize(uint32_t nRows,
 {
   uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x);
   if (iRow >= nRows) return;
-  if (numSamples != NULL and numSamples[iRow] < 1) return;
+  if (numSamples != nullptr and numSamples[iRow] < 1) return;
 
   float sqsum = 0.0;
   for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
@@ -384,8 +376,8 @@ __global__ void kern_outer_add(const float* a,
   uint64_t iA  = gid / numB;
   uint64_t iB  = gid % numB;
   if (iA >= numA) return;
-  float valA = (a == NULL) ? 0.0 : a[iA];
-  float valB = (b == NULL) ? 0.0 : b[iB];
+  float valA = (a == nullptr) ? 0.0 : a[iA];
+  float valB = (b == nullptr) ? 0.0 : b[iB];
   c[gid]     = valA + valB;
 }
 
@@ -421,24 +413,7 @@ __global__ void kern_copy_with_list(uint32_t nRows,
   uint64_t iRow = gid / nCols;
   if (iRow >= nRows) return;
   uint64_t iaRow             = rowList[iRow];
-  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)];
-}
-template <typename T>
-__global__ void kern_copy_with_list(uint32_t nRows,
-                                    uint32_t nCols,
-                                    const T* src,             // [..., ldSrc]
-                                    const uint32_t* rowList,  // [nRows,]
-                                    uint32_t ldSrc,
-                                    float* dst,  // [nRows, ldDst]
-                                    uint32_t ldDst,
-                                    float divisor)
-{
-  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint64_t iCol = gid % nCols;
-  uint64_t iRow = gid / nCols;
-  if (iRow >= nRows) return;
-  uint64_t iaRow             = rowList[iRow];
-  dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
+  dst[iCol + (ldDst * iRow)] = float(src[iCol + (ldSrc * iaRow)]) / config<T>::kDivisor;
 }
 
 /**
@@ -454,7 +429,6 @@ void _cuann_copy_with_list(uint32_t nRows,
                            uint32_t ldSrc,
                            float* dst,  // [nRows, ldDst]
                            uint32_t ldDst,
-                           float divisor,
                            rmm::cuda_stream_view stream)
 {
   cudaPointerAttributes attr;
@@ -464,7 +438,7 @@ void _cuann_copy_with_list(uint32_t nRows,
     for (uint64_t iRow = 0; iRow < nRows; iRow++) {
       uint64_t iaRow = rowList[iRow];
       for (uint64_t iCol = 0; iCol < nCols; iCol++) {
-        dst[iCol + (ldDst * iRow)] = src[iCol + (ldSrc * iaRow)] / divisor;
+        dst[iCol + (ldDst * iRow)] = float(src[iCol + (ldSrc * iaRow)]) / config<T>::kDivisor;
       }
     }
     stream.synchronize();
@@ -472,7 +446,7 @@ void _cuann_copy_with_list(uint32_t nRows,
     uint32_t nThreads = 128;
     uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
     kern_copy_with_list<T>
-      <<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst, divisor);
+      <<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst);
   }
 }
 }  // namespace raft::spatial::knn::detail::utils

From 888daebd15df312813e6f8b8e10bc13a15f14073 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 2 Jun 2022 16:05:58 +0200
Subject: [PATCH 055/118] minor comments

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 00314b58dd..12abde7fd9 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -605,7 +605,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries,
     "cuivflSearch(%u, %u, %zu)", batch_size, k, neighbors);
   cuivflSearchImpl<float>(queries, batch_size, k, neighbors, distances);
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflSearch
+}
 
 template <typename T>
 template <typename value_t>
@@ -739,6 +739,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
 
+  // Merge topk values from different blocks
   if (grid_dim_x_ > 1) {
     if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
       topk::warp_sort_topk<value_t, size_t>(refined_distances_dev.data(),
@@ -762,9 +763,9 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                                                  stream_,
                                                  &(search_mem_res.value()));
     }
-  }  // end if nprobe=1
+  }
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflHandle::cuivflSearchImpl
+}
 
 }  // namespace raft::spatial::knn::detail

From e6ff267b62bf66bd328ab004a94425cb610cbc89 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 3 Jun 2022 14:09:26 +0200
Subject: [PATCH 056/118] ann_utils refactoring, docs, and clang-tidy

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  34 +-
 .../knn/detail/ann_ivf_flat_kernel.cuh        |  21 +-
 .../knn/detail/ann_kmeans_balanced.cuh        |  54 +-
 .../raft/spatial/knn/detail/ann_utils.cuh     | 621 ++++++++++--------
 4 files changed, 389 insertions(+), 341 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 12abde7fd9..0816a19307 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -344,14 +344,14 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
     }
     assert(k == mesoClusterSize[i]);
 
-    utils::_cuann_copy_with_list<T>(mesoClusterSize[i],
-                                    dimDataset,
-                                    trainset,
-                                    idsTrainset,
-                                    dimDataset,
-                                    subTrainset,
-                                    dimDataset,
-                                    stream_);
+    utils::copy_selected<T>(mesoClusterSize[i],
+                            dimDataset,
+                            trainset,
+                            idsTrainset,
+                            dimDataset,
+                            subTrainset,
+                            dimDataset,
+                            stream_);
 
     for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
       RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (numClusters: %u): %.1f / %u",
@@ -490,7 +490,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   centriod_norm_dev_.resize(nlist_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::_cuann_sqsum(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
+    utils::dots_along_rows(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
 
@@ -645,7 +645,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
     linalg::unaryOp(
-      converted_queries_ptr, queries, batch_size * dim_, utils::mapping<T, float>{}, stream_);
+      converted_queries_ptr, queries, batch_size * dim_, utils::mapping<float>{}, stream_);
   }
 
   float alpha = 1.0f;
@@ -654,13 +654,13 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
-    utils::_cuann_sqsum(batch_size, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
-    utils::_cuann_outer_add(query_norm_dev.data(),
-                            batch_size,
-                            centriod_norm_dev_.data(),
-                            nlist_,
-                            distance_buffer_dev.data(),
-                            stream_);
+    utils::dots_along_rows(batch_size, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
+    utils::outer_add(query_norm_dev.data(),
+                     batch_size,
+                     centriod_norm_dev_.data(),
+                     nlist_,
+                     distance_buffer_dev.data(),
+                     stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
     RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 908828f300..c963f55144 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -38,6 +38,8 @@
 
 namespace raft::spatial::knn::detail {
 
+constexpr int kThreadsPerBlock = 128;
+
 /**
  * @brief Copy Veclen elements of type T from `query` to `query_shared` at position `loadDim *
  * Veclen`.
@@ -720,7 +722,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
  * query_smem_elems must be multiple of WarpSize * Veclen
  */
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
-__global__ void __launch_bounds__(utils::kThreadPerBlock)
+__global__ void __launch_bounds__(kThreadsPerBlock)
   interleaved_scan_kernel(Lambda compute_dist,
                           const uint32_t query_smem_elems,
                           const T* queries,
@@ -744,8 +746,8 @@ __global__ void __launch_bounds__(utils::kThreadPerBlock)
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
   // for comparison purpose
-  __shared__ float smemK[utils::kNumWarps * 32];
-  __shared__ size_t smemV[utils::kNumWarps * 32];
+  __shared__ float smemK[kThreadsPerBlock];
+  __shared__ size_t smemV[kThreadsPerBlock];
 
   constexpr auto Dir = Greater;
   constexpr auto identity =
@@ -754,7 +756,7 @@ __global__ void __launch_bounds__(utils::kThreadPerBlock)
     Dir ? std::numeric_limits<size_t>::min() : std::numeric_limits<size_t>::max();
 
   faiss::gpu::
-    BlockSelect<float, size_t, Dir, faiss::gpu::Comparator<float>, 32, 2, utils::kThreadPerBlock>
+    BlockSelect<float, size_t, Dir, faiss::gpu::Comparator<float>, 32, 2, kThreadsPerBlock>
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
@@ -801,10 +803,11 @@ __global__ void __launch_bounds__(utils::kThreadPerBlock)
     // The number of interleaved group to be processed
     const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
 
+    constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
     // Every warp reads WarpSize vectors and computes the distances to them.
     // Then, the distances and corresponding ids are distributed among the threads,
     // and each thread adds one (id, dist) pair to the filtering queue.
-    for (uint32_t block = warpId; block < numBlocks; block += utils::kNumWarps) {
+    for (uint32_t block = warpId; block < numBlocks; block += kNumWarps) {
       AccT dist = 0;
       // This is the vector a given lane/thread handles
       const uint32_t vec = block * WarpSize + laneId;
@@ -868,7 +871,7 @@ __global__ void __launch_bounds__(utils::kThreadPerBlock)
   /// Warp_wise topk
 #ifdef USE_FAISS
   queue.reduce();
-  for (int i = threadIdx.x; i < k; i += utils::kThreadPerBlock) {
+  for (int i = threadIdx.x; i < k; i += kThreadsPerBlock) {
     neighbors[queryId * k * gridDim.x + blockIdx.x * k + i] = (size_t)smemV[i];
     distances[queryId * k * gridDim.x + blockIdx.x * k + i] = smemK[i];
   }
@@ -891,7 +894,7 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSi
   RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
   int num_blocks_per_sm = 0;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks_per_sm, func, utils::kThreadPerBlock, sMemSize));
+    &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
 
   size_t min_grid_size = num_sms * num_blocks_per_sm;
   size_t min_grid_x    = ceildiv<size_t>(min_grid_size, numQueries);
@@ -923,7 +926,7 @@ void launch_kernel(Lambda lambda,
 #ifndef USE_FAISS
   constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
   smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
-    utils::kThreadPerBlock / kSubwarpSize, k);
+    kThreadsPerBlock / kSubwarpSize, k);
 #endif
 
   // power-of-two less than cuda limit (for better addr alignment)
@@ -937,7 +940,7 @@ void launch_kernel(Lambda lambda,
   for (uint32_t query_offset = 0; query_offset < batch_size; query_offset += kMaxGridY) {
     uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, batch_size - query_offset);
     dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
-    dim3 block_dim(utils::kThreadPerBlock);
+    dim3 block_dim(kThreadsPerBlock);
     RAFT_LOG_TRACE(
       "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), nprobe = %d, "
       "smem_size = %d",
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 17f9f620a2..049940ab85 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -26,6 +26,7 @@
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
+#include <raft/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -59,9 +60,9 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
     alpha = -1.0;
     beta  = 0.0;
   } else {
-    utils::_cuann_sqsum(numCenters, dimCenters, centers, sqsumCenters, stream);
-    utils::_cuann_sqsum(numDataset, dimDataset, dataset, sqsumDataset, stream);
-    utils::_cuann_outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances, stream);
+    utils::dots_along_rows(numCenters, dimCenters, centers, sqsumCenters, stream);
+    utils::dots_along_rows(numDataset, dimDataset, dataset, sqsumDataset, stream);
+    utils::outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances, stream);
     alpha = -2.0;
     beta  = 1.0;
   }
@@ -80,7 +81,7 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
                distances,
                numCenters,
                stream);
-  utils::_cuann_argmin(numDataset, numCenters, distances, labels, stream);
+  utils::argmin_along_rows(numDataset, numCenters, distances, labels, stream);
 }
 
 //
@@ -104,20 +105,22 @@ size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters,
 {
   uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
   size_t size    = 0;
+  using align_t  = Pow2<128>;
   // float *curDataset;  // [chunk, dimCenters]
-  size += utils::_cuann_aligned(sizeof(float) * chunk * dimCenters);
+  size += align_t::roundUp(sizeof(float) * chunk * dimCenters);
   // void *bufDataset;  // [chunk, dimCenters]
-  size += utils::_cuann_aligned(sizeof(float) * chunk * dimCenters);
+  size += align_t::roundUp(sizeof(float) * chunk * dimCenters);
   // float *workspace;
-  size += utils::_cuann_aligned(sizeof(float) * (numCenters + chunk + (numCenters * chunk)));
+  size += align_t::roundUp(sizeof(float) * (numCenters + chunk + (numCenters * chunk)));
   return size;
 }
 
 /**
  * @brief update kmeans centers
  *
- * NB: `centers` and `clusterSize` must be accessible on GPU due to _cuann_divide/_cuann_normalize.
- *      The rest can be both, under assumption that all pointer are accessible from the same place.
+ * NB: `centers` and `clusterSize` must be accessible on GPU due to
+ * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointer are
+ * accessible from the same place.
  *
  * i.e. two variants are possible:
  *
@@ -139,20 +142,20 @@ void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
 {
   if (accumulatedCenters == nullptr) {
     // accumulate
-    utils::_cuann_memset(centers, 0, sizeof(float) * numCenters * dimCenters, stream);
-    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
-    utils::_cuann_accumulate_with_label<T>(
-      numCenters, dimCenters, centers, clusterSize, numDataset, dataset, labels, stream);
+    utils::memset(centers, 0, sizeof(float) * numCenters * dimCenters, stream);
+    utils::memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
+    utils::accumulate_into_selected<T>(
+      numDataset, dimCenters, centers, clusterSize, dataset, labels, stream);
   } else {
     copy(centers, accumulatedCenters, numCenters * dimCenters, stream);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
     // normalize
-    utils::_cuann_normalize(numCenters, dimCenters, centers, clusterSize, stream);
+    utils::normalize_rows(numCenters, dimCenters, centers, stream);
   } else {
     // average
-    utils::_cuann_divide(numCenters, dimCenters, centers, clusterSize, stream);
+    utils::divide_along_rows(numCenters, dimCenters, centers, clusterSize, stream);
   }
 }
 
@@ -220,14 +223,13 @@ void _cuann_kmeans_predict(const handle_t& handle,
   T* bufDataset;      // [chunk, dimCenters]
   float* workspace_core;
   curDataset = (float*)workspace;
-  bufDataset =
-    (T*)((uint8_t*)curDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
+  bufDataset = (T*)((uint8_t*)curDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters));
   workspace_core =
-    (float*)((uint8_t*)bufDataset + utils::_cuann_aligned(sizeof(float) * chunk * dimCenters));
+    (float*)((uint8_t*)bufDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters));
 
   if (tempCenters != nullptr && clusterSize != nullptr) {
-    utils::_cuann_memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters, stream);
-    utils::_cuann_memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
+    utils::memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters, stream);
+    utils::memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
   }
 
   for (uint64_t is = 0; is < numDataset; is += chunk) {
@@ -242,7 +244,7 @@ void _cuann_kmeans_predict(const handle_t& handle,
       curDataset = bufDataset;
     } else {
       linalg::unaryOp(
-        curDataset, bufDataset, nDataset * dimCenters, utils::mapping<T, float>{}, stream);
+        curDataset, bufDataset, nDataset * dimCenters, utils::mapping<float>{}, stream);
     }
 
     // predict
@@ -259,14 +261,8 @@ void _cuann_kmeans_predict(const handle_t& handle,
 
     if ((tempCenters != nullptr) && (clusterSize != nullptr)) {
       // accumulate
-      utils::_cuann_accumulate_with_label<float>(numCenters,
-                                                 dimCenters,
-                                                 tempCenters,
-                                                 clusterSize,
-                                                 nDataset,
-                                                 curDataset,
-                                                 labels + is,
-                                                 stream);
+      utils::accumulate_into_selected<float>(
+        nDataset, dimCenters, tempCenters, clusterSize, curDataset, labels + is, stream);
     }
   }
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 8dc75067b3..cae531a312 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -25,35 +25,56 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace raft::spatial ::knn::detail::utils {
+namespace raft::spatial::knn::detail::utils {
+
+/** Whether pointers are accessible on the device or on the host. */
+enum class pointer_residency {
+  /** Some of the pointers are on the device, some on the host. */
+  mixed,
+  /** All pointers accessible from both the device and the host. */
+  host_and_device,
+  /** All pointers are host accessible. */
+  host_only,
+  /** All poitners are device accessible. */
+  device_only
+};
 
-constexpr int kThreadPerBlock = 128;
-constexpr int kNumWarps       = kThreadPerBlock / WarpSize;
+template <typename... Types>
+struct pointer_residency_count {
+};
 
-size_t _cuann_aligned(size_t size, size_t unit = 128)
-{
-  if (size % unit) { size += unit - (size % unit); }
-  return size;
-}
+template <>
+struct pointer_residency_count<> {
+  static inline auto run() -> std::tuple<int, int> { return std::make_tuple(0, 0); }
+};
 
-/**
- * @brief Sets the first num bytes of the block of memory pointed by ptr to the specified value.
- *
- * @param[out] ptr host or device pointer
- * @param[in] value
- * @param[in] count
- */
-void _cuann_memset(void* ptr, int value, size_t count, rmm::cuda_stream_view stream)
-{
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, ptr);
-  if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) {
-    RAFT_CUDA_TRY(cudaMemsetAsync(ptr, value, count, stream));
-  } else {
-    stream.synchronize();
-    memset(ptr, value, count);
-    stream.synchronize();
+template <typename Type, typename... Types>
+struct pointer_residency_count<Type, Types...> {
+  static inline auto run(const Type* ptr, const Types*... ptrs) -> std::tuple<int, int>
+  {
+    auto [on_device, on_host] = pointer_residency_count<Types...>::run(ptrs...);
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
+    switch (attr.type) {
+      case cudaMemoryTypeUnregistered:
+      case cudaMemoryTypeHost: return std::make_tuple(on_device, on_host + 1);
+      case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host);
+      case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1);
+      default: return std::make_tuple(on_device, on_host);
+    }
   }
+};
+
+/** Check if all argument pointers reside on the host or on the device. */
+template <typename... Types>
+inline auto check_pointer_residency(const Types*... ptrs) -> pointer_residency
+{
+  auto [on_device, on_host] = pointer_residency_count<Types...>::run(ptrs...);
+  int n_args                = sizeof...(Types);
+  if (on_device == n_args && on_host == n_args) { return pointer_residency::host_and_device; }
+  if (on_device == n_args) { return pointer_residency::device_only; }
+  if (on_host == n_args) { return pointer_residency::host_only; }
+  return pointer_residency::mixed;
 }
 
 template <typename T>
@@ -62,114 +83,149 @@ struct config {
 
 template <>
 struct config<float> {
-  using value_t                   = float;
-  static constexpr float kDivisor = 1.0;
+  using value_t                    = float;
+  static constexpr double kDivisor = 1.0;
 };
 template <>
 struct config<uint8_t> {
-  using value_t                   = uint32_t;
-  static constexpr float kDivisor = 256.0;
+  using value_t                    = uint32_t;
+  static constexpr double kDivisor = 256.0;
 };
 template <>
 struct config<int8_t> {
-  using value_t                   = int32_t;
-  static constexpr float kDivisor = 128.0;
-};
-
-template <typename T, typename S>
-struct mapping {
-  HDI auto operator()(T x) -> S;
+  using value_t                    = int32_t;
+  static constexpr double kDivisor = 128.0;
 };
 
+/**
+ * @brief Converting values between the types taking into account scaling factors
+ * for the integral types.
+ *
+ * @tparam T target type of the mapping.
+ */
 template <typename T>
-struct mapping<T, T> {
-  HDI auto operator()(T x) -> T { return x; }
+struct mapping {
+  /**
+   * @defgroup
+   * @brief Cast and possibly scale a value of the source type `S` to the target type `T`.
+   *
+   * @tparam S source type
+   * @param x source value
+   * @{
+   */
+  template <typename S>
+  HDI auto operator()(const S& x) -> std::enable_if_t<std::is_same_v<S, T>, T>
+  {
+    return x;
+  };
+
+  template <typename S>
+  HDI auto operator()(const S& x) -> std::enable_if_t<!std::is_same_v<S, T>, T>
+  {
+    constexpr double kMult = config<S>::kDivisor / config<T>::kDivisor;
+    return static_cast<T>(static_cast<double>(x) * kMult);
+  };
+  /** @} */
 };
 
-template <typename T>
-struct mapping<T, float> {
-  HDI auto operator()(T x) -> float { return float(x) * kMult; }
-
- private:
-  static constexpr float kMult = 1 / config<T>::kDivisor;
+template <>
+struct mapping<float> {
+  template <typename S>
+  HDI auto operator()(const S& x) -> float
+  {
+    constexpr float kMult = static_cast<float>(config<float>::kDivisor / config<S>::kDivisor);
+    return static_cast<float>(x) * kMult;
+  };
 };
 
-template <typename S>
-struct mapping<float, S> {
-  HDI auto operator()(float x) -> S { return S(x * kMult); }
-
- private:
-  static constexpr float kMult = config<S>::kDivisor;
-};
+/**
+ * @brief Sets the first num bytes of the block of memory pointed by ptr to the specified value.
+ *
+ * @param[out] ptr host or device pointer
+ * @param[in] value
+ * @param[in] n_bytes
+ */
+void memset(void* ptr, int value, size_t n_bytes, rmm::cuda_stream_view stream)
+{
+  switch (check_pointer_residency(ptr)) {
+    case pointer_residency::host_and_device:
+    case pointer_residency::device_only: {
+      RAFT_CUDA_TRY(cudaMemsetAsync(ptr, value, n_bytes, stream));
+    } break;
+    case pointer_residency::host_only: {
+      stream.synchronize();
+      ::memset(ptr, value, n_bytes);
+    } break;
+    default: RAFT_FAIL("memset: unreachable code");
+  }
+}
 
-__global__ void kern_argmin(uint32_t nRows,
-                            uint32_t nCols,
-                            const float* a,  // [nRows, nCols]
-                            uint32_t* out    // [nRows]
-)
+__global__ void argmin_along_rows_kernel(uint32_t n_rows,
+                                         uint32_t n_cols,
+                                         const float* a,
+                                         uint32_t* out)
 {
-  __shared__ uint32_t smCol[1024];
-  __shared__ float smVal[1024];
-  uint32_t iRow = blockIdx.x;
-  if (iRow >= nRows) return;
-  uint32_t iCol   = threadIdx.x;
-  uint32_t minCol = nCols;
-  float minVal    = FLT_MAX;
-  for (iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
-    if (minVal > a[iCol + (nCols * iRow)]) {
-      minVal = a[iCol + (nCols * iRow)];
-      minCol = iCol;
+  __shared__ uint32_t shm_ids[1024];  // NOLINT
+  __shared__ float shm_vals[1024];    // NOLINT
+  uint32_t i = blockIdx.x;
+  if (i >= n_rows) return;
+  uint32_t min_idx = n_cols;
+  float min_val    = raft::upper_bound<float>();
+  for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) {
+    if (min_val > a[j + n_cols * i]) {
+      min_val = a[j + n_cols * i];
+      min_idx = j;
     }
   }
-  smVal[threadIdx.x] = minVal;
-  smCol[threadIdx.x] = minCol;
+  shm_vals[threadIdx.x] = min_val;
+  shm_ids[threadIdx.x]  = min_idx;
   __syncthreads();
   for (uint32_t offset = blockDim.x / 2; offset > 0; offset >>= 1) {
     if (threadIdx.x < offset) {
-      if (smVal[threadIdx.x] < smVal[threadIdx.x + offset]) {
-      } else if (smVal[threadIdx.x] > smVal[threadIdx.x + offset]) {
-        smVal[threadIdx.x] = smVal[threadIdx.x + offset];
-        smCol[threadIdx.x] = smCol[threadIdx.x + offset];
-      } else if (smCol[threadIdx.x] > smCol[threadIdx.x + offset]) {
-        smCol[threadIdx.x] = smCol[threadIdx.x + offset];
+      if (shm_vals[threadIdx.x] < shm_vals[threadIdx.x + offset]) {
+      } else if (shm_vals[threadIdx.x] > shm_vals[threadIdx.x + offset]) {
+        shm_vals[threadIdx.x] = shm_vals[threadIdx.x + offset];
+        shm_ids[threadIdx.x]  = shm_ids[threadIdx.x + offset];
+      } else if (shm_ids[threadIdx.x] > shm_ids[threadIdx.x + offset]) {
+        shm_ids[threadIdx.x] = shm_ids[threadIdx.x + offset];
       }
     }
     __syncthreads();
   }
-  if (threadIdx.x == 0) { out[iRow] = smCol[0]; }
+  if (threadIdx.x == 0) { out[i] = shm_ids[0]; }
 }
 
 /**
- * argmin along column
+ * @brief Find index of the smallest element in each row.
+ *
+ * NB: device-only function
+ * TODO: specialize select_k for the case of `k == 1` and use that one instead.
  *
- * NB: device-only
+ * @param n_rows
+ * @param n_cols
+ * @param[in] a device pointer to the row-major matrix [n_rows, n_cols]
+ * @param[out] out device pointer to the vector of selected indices [n_rows]
+ * @param stream
  */
-void _cuann_argmin(uint32_t nRows,
-                   uint32_t nCols,
-                   const float* a,  // [nRows, nCols]
-                   uint32_t* out,   // [nRows]
-                   rmm::cuda_stream_view stream)
+void argmin_along_rows(
+  uint32_t n_rows, uint32_t n_cols, const float* a, uint32_t* out, rmm::cuda_stream_view stream)
 {
-  uint32_t nThreads = 1024;
-  while (nThreads > nCols) {
-    nThreads /= 2;
+  uint32_t block_dim = 1024;
+  while (block_dim > n_cols) {
+    block_dim /= 2;
   }
-  nThreads = max(nThreads, 128);
-  kern_argmin<<<nRows, nThreads, 0, stream>>>(nRows, nCols, a, out);
+  block_dim = max(block_dim, 128);
+  argmin_along_rows_kernel<<<n_rows, block_dim, 0, stream>>>(n_rows, n_cols, a, out);
 }
 
-__global__ void kern_sqsum(uint32_t nRows,
-                           uint32_t nCols,
-                           const float* a,  // [nRows, nCols]
-                           float* out       // [nRows]
-)
+__global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const float* a, float* out)
 {
-  uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x);
-  if (iRow >= nRows) return;
+  uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x);
+  if (i >= n_rows) return;
 
   float sqsum = 0.0;
-  for (uint64_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
-    float val = a[iCol + (nCols * iRow)];
+  for (uint64_t j = threadIdx.x; j < n_cols; j += blockDim.x) {
+    float val = a[j + (n_cols * i)];
     sqsum += val * val;
   }
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
@@ -177,126 +233,107 @@ __global__ void kern_sqsum(uint32_t nRows,
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
-  if (threadIdx.x == 0) { out[iRow] = sqsum; }
+  if (threadIdx.x == 0) { out[i] = sqsum; }
 }
 
 /**
- * @brief Square sum along rows (row-major).
+ * @brief Square sum of values in each row (row-major matrix).
  *
- * NB: device-only
+ * NB: device-only function
  *
- * @param nRows
- * @param nCols
- * @param[in] a device pointer to the row-major matrix [nRows, nCols]
- * @param[out] out device pointer to the vector of dot-products [nRows]
+ * @param n_rows
+ * @param n_cols
+ * @param[in] a device pointer to the row-major matrix [n_rows, n_cols]
+ * @param[out] out device pointer to the vector of dot-products [n_rows]
  * @param stream
  */
-void _cuann_sqsum(uint32_t nRows,
-                  uint32_t nCols,
-                  const float* a,  // [numDataset, dimDataset]
-                  float* out,      // [numDataset,]
-                  rmm::cuda_stream_view stream)
+void dots_along_rows(
+  uint32_t n_rows, uint32_t n_cols, const float* a, float* out, rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);
-  dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
-  kern_sqsum<<<blocks, threads, 0, stream>>>(nRows, nCols, a, out);
+  dim3 blocks(ceildiv(n_rows, threads.y), 1, 1);
+  dots_along_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a, out);
   /**
    * TODO: this can be replaced with the rowNorm helper as shown below.
    * However, the rowNorm helper seems to incur a significant performance penalty
    * (example case ann-search slowed down from 150ms to 186ms).
    *
-   * raft::linalg::rowNorm(out, a, nCols, nRows, raft::linalg::L2Norm, true, stream);
+   * raft::linalg::rowNorm(out, a, n_cols, n_rows, raft::linalg::L2Norm, true, stream);
    */
 }
 
 template <typename T>
-__global__ void kern_accumulate_with_label(uint32_t nRowsOutput,
-                                           uint32_t nCols,
-                                           float* output,    // [nRowsOutput, nCols,]
-                                           uint32_t* count,  // [nRowsOutput,]
-                                           uint32_t nRowsInput,
-                                           const T* input,        // [nRowsInput, nCols,]
-                                           const uint32_t* label  // [nRowsInput,]
-)
+__global__ void accumulate_into_selected_kernel(uint32_t n_rows,
+                                                uint32_t n_cols,
+                                                float* output,
+                                                uint32_t* selection_counters,
+                                                const T* input,
+                                                const uint32_t* row_ids)
 {
-  uint64_t gid       = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint64_t iCol      = gid % nCols;
-  uint64_t iRowInput = gid / nCols;
-  if (iRowInput >= nRowsInput) return;
-  uint64_t iRowOutput = label[iRowInput];
-  if (iCol == 0) { atomicAdd(&(count[iRowOutput]), 1); }
-  atomicAdd(&(output[iCol + (nCols * iRowOutput)]), float(input[gid]) / config<T>::kDivisor);
+  uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x);
+  uint64_t j   = gid % n_cols;
+  uint64_t i   = gid / n_cols;
+  if (i >= n_rows) return;
+  uint64_t l = row_ids[i];
+  if (j == 0) { atomicAdd(&(selection_counters[l]), 1); }
+  atomicAdd(&(output[j + n_cols * l]), mapping<float>{}(input[gid]));
 }
 
 /**
- * @brief Accumulate
- *
- * Pointer residency: altogether available either on GPU or on CPU
+ * @brief Add all rows of input matrix into a selection of rows in the output matrix
+ * (cast and possibly scale the data input type). Count the number of times every output
+ * row was selected along the way.
  *
  * @tparam T
  *
- * @param nRowsOutput
- * @param nCols
- * @param output device/host pointer
- * @param count device/host pointer
- * @param nRowsInput
- * @param input device/host pointer
- * @param label device/host pointer
+ * @param n_cols number of columns in all matrices
+ * @param[out] output output matrix [..., n_cols]
+ * @param[out] selection_counters number of occurrences of each row id in row_ids [..., n_cols]
+ * @param n_rows number of rows in the input
+ * @param[in] input row-major input matrix [n_rows, n_cols]
+ * @param[in] row_ids row indices in the output matrix [n_rows]
  */
 template <typename T>
-void _cuann_accumulate_with_label(uint32_t nRowsOutput,
-                                  uint32_t nCols,
-                                  float* output,    // [nRowsOutput, nCols,]
-                                  uint32_t* count,  // [nRowsOutput,]
-                                  uint32_t nRowsInput,
-                                  const T* input,         // [nRowsInput, nCols,]
-                                  const uint32_t* label,  // [nRowsInput,]
-                                  rmm::cuda_stream_view stream)
+void accumulate_into_selected(uint32_t n_rows,
+                              uint32_t n_cols,
+                              float* output,
+                              uint32_t* selection_counters,
+                              const T* input,
+                              const uint32_t* row_ids,
+                              rmm::cuda_stream_view stream)
 {
-  bool useGPU = 1;
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, output);
-  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
-  cudaPointerGetAttributes(&attr, count);
-  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
-  cudaPointerGetAttributes(&attr, input);
-  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
-  cudaPointerGetAttributes(&attr, label);
-  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) { useGPU = 0; }
-
-  if (useGPU) {
-    // GPU
-    uint32_t nThreads = 128;
-    uint64_t nBlocks  = ceildiv((uint64_t)nRowsInput * nCols, (uint64_t)nThreads);
-    kern_accumulate_with_label<T><<<nBlocks, nThreads, 0, stream>>>(
-      nRowsOutput, nCols, output, count, nRowsInput, input, label);
-  } else {
-    // CPU
-    stream.synchronize();
-    for (uint64_t i = 0; i < nRowsInput; i++) {
-      uint64_t l = label[i];
-      count[l] += 1;
-      for (uint64_t j = 0; j < nCols; j++) {
-        output[j + (nCols * l)] += float(input[j + (nCols * i)]) / config<T>::kDivisor;
+  switch (check_pointer_residency(output, input, selection_counters, row_ids)) {
+    case pointer_residency::host_and_device:
+    case pointer_residency::device_only: {
+      uint32_t block_dim = 128;
+      auto grid_dim      = static_cast<uint32_t>(ceildiv<uint64_t>(
+        static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(n_cols), block_dim));
+      accumulate_into_selected_kernel<T><<<grid_dim, block_dim, 0, stream>>>(
+        n_rows, n_cols, output, selection_counters, input, row_ids);
+    } break;
+    case pointer_residency::host_only: {
+      stream.synchronize();
+      for (uint64_t i = 0; i < n_rows; i++) {
+        uint64_t l = row_ids[i];
+        selection_counters[l]++;
+        for (uint64_t j = 0; j < n_cols; j++) {
+          output[j + n_cols * l] += mapping<float>{}(input[j + n_cols * i]);
+        }
       }
-    }
-    stream.synchronize();
+      stream.synchronize();
+    } break;
+    default: RAFT_FAIL("All pointers must reside on the same side, host or device.");
   }
 }
 
-__global__ void kern_normalize(uint32_t nRows,
-                               uint32_t nCols,
-                               float* a,                   // [nRows, nCols]
-                               const uint32_t* numSamples  // [nRows,]
-)
+__global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a)
 {
-  uint64_t iRow = threadIdx.y + (blockDim.y * blockIdx.x);
-  if (iRow >= nRows) return;
-  if (numSamples != nullptr and numSamples[iRow] < 1) return;
+  uint64_t i = threadIdx.y + (blockDim.y * blockIdx.x);
+  if (i >= n_rows) return;
 
   float sqsum = 0.0;
-  for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
-    float val = a[iCol + (nCols * iRow)];
+  for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) {
+    float val = a[j + (n_cols * i)];
     sqsum += val * val;
   }
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
@@ -304,149 +341,161 @@ __global__ void kern_normalize(uint32_t nRows,
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
   sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
-  sqsum = sqrt(sqsum);
-  for (uint32_t iCol = threadIdx.x; iCol < nCols; iCol += blockDim.x) {
-    a[iCol + (nCols * iRow)] /= sqsum;
+  if (sqsum <= 1e-8) return;
+  sqsum = rsqrtf(sqsum);  // reciprocal of the square root
+  for (uint32_t j = threadIdx.x; j < n_cols; j += blockDim.x) {
+    a[j + n_cols * i] *= sqsum;
   }
 }
 
 /**
- * @brief Normalize
+ * @brief Divide rows by their L2 norm (square root of sum of squares).
  *
- * NB: device-only
+ * NB: device-only function
  *
- * @param[in] nRows
- * @param[in] nCols
- * @param[inout] a device pointer
- * @param[in] numSamples device pointer
+ * @param[in] n_rows
+ * @param[in] n_cols
+ * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
+ * @param stream
  */
-void _cuann_normalize(uint32_t nRows,
-                      uint32_t nCols,
-                      float* a,                    // [nRows, nCols]
-                      const uint32_t* numSamples,  // [nRows,]
-                      rmm::cuda_stream_view stream)
+void normalize_rows(uint32_t n_rows, uint32_t n_cols, float* a, rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
-  dim3 blocks(ceildiv(nRows, threads.y), 1, 1);
-  kern_normalize<<<blocks, threads, 0, stream>>>(nRows, nCols, a, numSamples);
+  dim3 blocks(ceildiv(n_rows, threads.y), 1, 1);
+  normalize_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a);
 }
 
-// divide
-__global__ void kern_divide(uint32_t nRows,
-                            uint32_t nCols,
-                            float* a,                   // [nRows, nCols]
-                            const uint32_t* numSamples  // [nRows,]
-)
+__global__ void divide_along_rows_kernel(uint32_t n_rows,
+                                         uint32_t n_cols,
+                                         float* a,
+                                         const uint32_t* d)
 {
-  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint64_t iRow = gid / nCols;
-  if (iRow >= nRows) return;
-  if (numSamples[iRow] == 0) return;
-  a[gid] /= numSamples[iRow];
+  uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x;
+  uint64_t i   = gid / n_cols;
+  if (i >= n_rows) return;
+  if (d[i] != 0) { a[gid] /= d[i]; }
 }
 
 /**
- * @brief Divide
+ * @brief Divide matrix values along rows by an integer value, skipping rows if the corresponding
+ * divisor is zero.
  *
- * NB: device-only
+ * NB: device-only function
  *
- * @param[in] nRows
- * @param[in] nCols
- * @param[inout] a device pointer
- * @param[in] numSamples device pointer
+ * @param[in] n_rows
+ * @param[in] n_cols
+ * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
+ * @param[in] d device pointer to a vector of divisors [n_rows]
  */
-void _cuann_divide(uint32_t nRows,
-                   uint32_t nCols,
-                   float* a,                    // [nRows, nCols]
-                   const uint32_t* numSamples,  // [nRows,]
-                   rmm::cuda_stream_view stream)
+void divide_along_rows(
+  uint32_t n_rows, uint32_t n_cols, float* a, const uint32_t* d, rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);
-  dim3 blocks(ceildiv<uint64_t>((uint64_t)nRows * (uint64_t)nCols, threads.x), 1, 1);
-  kern_divide<<<blocks, threads, 0, stream>>>(nRows, nCols, a, numSamples);
+  dim3 blocks(
+    ceildiv<uint64_t>(static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(n_cols), threads.x),
+    1,
+    1);
+  divide_along_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a, d);
 }
-__global__ void kern_outer_add(const float* a,
-                               uint32_t numA,
-                               const float* b,
-                               uint32_t numB,
-                               float* c  // [numA, numB]
-)
+
+template <typename T>
+__global__ void outer_add_kernel(const T* a, uint32_t len_a, const T* b, uint32_t len_b, T* c)
 {
-  uint64_t gid = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint64_t iA  = gid / numB;
-  uint64_t iB  = gid % numB;
-  if (iA >= numA) return;
-  float valA = (a == nullptr) ? 0.0 : a[iA];
-  float valB = (b == nullptr) ? 0.0 : b[iB];
-  c[gid]     = valA + valB;
+  uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x;
+  uint64_t i   = gid / len_b;
+  uint64_t j   = gid % len_b;
+  if (i >= len_a) return;
+  c[gid] = (a == nullptr ? T(0) : a[i]) + (b == nullptr ? T(0) : b[j]);
 }
 
 /**
- * outer add
+ * @brief Fill matrix `c` with all combinations of sums of vectors `a` and `b`.
  *
- * NB: device-only
+ * NB: device-only function
+ *
+ * @tparam T element type
+ *
+ * @param[in] a device pointer to a vector [len_a]
+ * @param len_a number of elements in `a`
+ * @param[in] b device pointer to a vector [len_b]
+ * @param len_b number of elements in `b`
+ * @param[out] c row-major matrix [len_a, len_b]
+ * @param stream
  */
-void _cuann_outer_add(const float* a,
-                      uint32_t numA,
-                      const float* b,
-                      uint32_t numB,
-                      float* c,  // [numA, numB]
-                      rmm::cuda_stream_view stream)
+template <typename T>
+void outer_add(
+  const T* a, uint32_t len_a, const T* b, uint32_t len_b, T* c, rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);
-  dim3 blocks(ceildiv<uint64_t>((uint64_t)numA * (uint64_t)numB, threads.x), 1, 1);
-  kern_outer_add<<<blocks, threads, 0, stream>>>(a, numA, b, numB, c);
+  dim3 blocks(
+    ceildiv<uint64_t>(static_cast<uint64_t>(len_a) * static_cast<uint64_t>(len_b), threads.x),
+    1,
+    1);
+  outer_add_kernel<<<blocks, threads, 0, stream>>>(a, len_a, b, len_b, c);
 }
 
-// copy with row list
-template <typename T>
-__global__ void kern_copy_with_list(uint32_t nRows,
-                                    uint32_t nCols,
-                                    const T* src,             // [..., ldSrc]
-                                    const uint32_t* rowList,  // [nRows,]
-                                    uint32_t ldSrc,
-                                    float* dst,  // [nRows, ldDst]
-                                    uint32_t ldDst)
+template <typename T, typename S>
+__global__ void copy_selected_kernel(uint32_t n_rows,
+                                     uint32_t n_cols,
+                                     const T* src,
+                                     const uint32_t* row_ids,
+                                     uint32_t ld_src,
+                                     S* dst,
+                                     uint32_t ld_dst)
 {
-  uint64_t gid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  uint64_t iCol = gid % nCols;
-  uint64_t iRow = gid / nCols;
-  if (iRow >= nRows) return;
-  uint64_t iaRow             = rowList[iRow];
-  dst[iCol + (ldDst * iRow)] = float(src[iCol + (ldSrc * iaRow)]) / config<T>::kDivisor;
+  uint64_t gid   = threadIdx.x + blockDim.x * blockIdx.x;
+  uint64_t j     = gid % n_cols;
+  uint64_t i_dst = gid / n_cols;
+  if (i_dst >= n_rows) return;
+  uint64_t i_src          = row_ids[i_dst];
+  dst[ld_dst * i_dst + j] = mapping<T>{}(src[ld_src * i_src + j]);
 }
 
 /**
- * copy with row list
+ * @brief Copy selected rows of a matrix while mapping the data from the source to the target
+ * type.
  *
- * NB: host or device
+ * @tparam T target type
+ * @tparam S source type
+ *
+ * @param n_rows
+ * @param n_cols
+ * @param[in] src input matrix [..., ld_src]
+ * @param[in] row_ids selection of rows to be copied [n_rows]
+ * @param ld_src number of cols in the input (ld_src >= n_cols)
+ * @param[out] dst output matrix [n_rows, ld_dst]
+ * @param ld_dst number of cols in the output (ld_dst >= n_cols)
+ * @param stream
  */
-template <typename T>
-void _cuann_copy_with_list(uint32_t nRows,
-                           uint32_t nCols,
-                           const T* src,             // [..., ldSrc]
-                           const uint32_t* rowList,  // [nRows,]
-                           uint32_t ldSrc,
-                           float* dst,  // [nRows, ldDst]
-                           uint32_t ldDst,
-                           rmm::cuda_stream_view stream)
+template <typename T, typename S>
+void copy_selected(uint32_t n_rows,
+                   uint32_t n_cols,
+                   const T* src,
+                   const uint32_t* row_ids,
+                   uint32_t ld_src,
+                   S* dst,
+                   uint32_t ld_dst,
+                   rmm::cuda_stream_view stream)
 {
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, src);
-  if (attr.type == cudaMemoryTypeUnregistered || attr.type == cudaMemoryTypeHost) {
-    stream.synchronize();
-    for (uint64_t iRow = 0; iRow < nRows; iRow++) {
-      uint64_t iaRow = rowList[iRow];
-      for (uint64_t iCol = 0; iCol < nCols; iCol++) {
-        dst[iCol + (ldDst * iRow)] = float(src[iCol + (ldSrc * iaRow)]) / config<T>::kDivisor;
+  switch (check_pointer_residency(src, dst)) {
+    case pointer_residency::host_and_device:
+    case pointer_residency::device_only: {
+      uint32_t block_dim = 128;
+      uint32_t grid_dim  = ceildiv(n_rows * n_cols, block_dim);
+      copy_selected_kernel<T, S>
+        <<<grid_dim, block_dim, 0, stream>>>(n_rows, n_cols, src, row_ids, ld_src, dst, ld_dst);
+    } break;
+    case pointer_residency::host_only: {
+      stream.synchronize();
+      for (uint64_t i_dst = 0; i_dst < n_rows; i_dst++) {
+        uint64_t i_src = row_ids[i_dst];
+        for (uint64_t j = 0; j < n_cols; j++) {
+          dst[ld_dst * i_dst + j] = mapping<T>{}(src[ld_src * i_src + j]);
+        }
       }
-    }
-    stream.synchronize();
-  } else {
-    uint32_t nThreads = 128;
-    uint32_t nBlocks  = ceildiv(nRows * nCols, nThreads);
-    kern_copy_with_list<T>
-      <<<nBlocks, nThreads, 0, stream>>>(nRows, nCols, src, rowList, ldSrc, dst, ldDst);
+      stream.synchronize();
+    } break;
+    default: RAFT_FAIL("All pointers must reside on the same side, host or device.");
   }
 }
 }  // namespace raft::spatial::knn::detail::utils

From bacb402ebfb4918a620a6988d5e36ddcc1ecb6cd Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 7 Jun 2022 11:16:51 +0200
Subject: [PATCH 057/118] Refactor tests and reduce their memory footprint

---
 cpp/test/spatial/ann_base_kernel.cuh | 108 +++++++-----
 cpp/test/spatial/ann_ivf_flat.cu     | 248 +++++++++++----------------
 2 files changed, 159 insertions(+), 197 deletions(-)

diff --git a/cpp/test/spatial/ann_base_kernel.cuh b/cpp/test/spatial/ann_base_kernel.cuh
index 51ac3f0ba6..2c9698eafd 100644
--- a/cpp/test/spatial/ann_base_kernel.cuh
+++ b/cpp/test/spatial/ann_base_kernel.cuh
@@ -16,73 +16,89 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/distance_type.hpp>
-#include <raft/spatial/knn/detail/selection_faiss.cuh>
+#include <raft/spatial/knn/knn.cuh>
 
 #include <rmm/device_uvector.hpp>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::spatial::knn {
 template <typename DataType, typename AccT>
 __global__ void naiveDistanceKernel(float* dist,
-                                    int64_t* indices,
                                     const DataType* x,
                                     const DataType* y,
-                                    int m,
-                                    int n,
-                                    int k,
-                                    raft::distance::DistanceType type,
-                                    bool isRowMajor)
+                                    size_t m,
+                                    size_t n,
+                                    size_t k,
+                                    raft::distance::DistanceType type)
 {
-  int midx = threadIdx.x + blockIdx.x * blockDim.x;
-  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) return;
-  AccT acc = AccT(0);
-  for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    if (type == raft::distance::DistanceType::InnerProduct) {
-      acc += x[xidx] * y[yidx];
-    } else {
-      AccT diff = x[xidx] - y[yidx];
-      acc += diff * diff;
+  size_t midx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (midx >= m) return;
+  for (size_t nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n;
+       nidx += blockDim.y * gridDim.y) {
+    AccT acc = AccT(0);
+    for (size_t i = 0; i < k; ++i) {
+      size_t xidx = i + midx * k;
+      size_t yidx = i + nidx * k;
+      if (type == raft::distance::DistanceType::InnerProduct) {
+        acc += x[xidx] * y[yidx];
+      } else {
+        AccT diff = x[xidx] - y[yidx];
+        acc += diff * diff;
+      }
     }
+    float dist_val = (float)acc;
+    if (type == raft::distance::DistanceType::L2SqrtExpanded ||
+        type == raft::distance::DistanceType::L2SqrtUnexpanded)
+      dist_val = raft::mySqrt(dist_val);
+    dist[midx * n + nidx] = dist_val;
   }
-  float dist_val = (float)acc;
-  if (type == raft::distance::DistanceType::L2SqrtExpanded ||
-      type == raft::distance::DistanceType::L2SqrtUnexpanded)
-    dist_val = raft::mySqrt(dist_val);
-  int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
-  dist[outidx]    = dist_val;
-  indices[outidx] = outidx;  // This is required because of the select_k API.
 }
 
-// currently using this naive kernel as FAISS & fusedL2kNN doesn't support 8-bit
+/**
+ * TODO: either replace this with brute_force_knn or with distance+select_k
+ *       when either distance or brute_force_knn support 8-bit int inputs.
+ */
 template <typename DataType, typename AccT>
 void naiveBfKnn(float* dist_topk,
                 int64_t* indices_topk,
                 const DataType* x,
                 const DataType* y,
-                int m,
-                int n,
-                int k,
-                int numOfNN,
+                size_t n_inputs,
+                size_t input_len,
+                size_t dim,
+                uint32_t k,
                 raft::distance::DistanceType type,
-                bool isRowMajor,
                 DataType metric_arg = 2.0f,
                 cudaStream_t stream = 0)
 {
-  static const dim3 TPB(16, 32, 1);
-  dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
+  dim3 block_dim(16, 32, 1);
+  // maximum reasonable grid size in `y` direction
+  uint16_t grid_y =
+    static_cast<uint16_t>(std::min<size_t>(raft::ceildiv<size_t>(input_len, block_dim.y), 32768));
+
+  // bound the memory used by this function
+  size_t max_batch_size =
+    std::min(n_inputs, raft::ceildiv<size_t>(size_t(1) << size_t(27), input_len));
+  rmm::device_uvector<float> dist(max_batch_size * input_len, stream);
+
+  for (size_t offset = 0; offset < n_inputs; offset += max_batch_size) {
+    size_t batch_size = std::min(max_batch_size, n_inputs - offset);
+    dim3 grid_dim(raft::ceildiv<size_t>(batch_size, block_dim.x), grid_y, 1);
 
-  rmm::device_uvector<float> dist(m * n, stream);
-  rmm::device_uvector<int64_t> indices(m * n, stream);
-  naiveDistanceKernel<DataType, AccT>
-    <<<nblks, TPB, 0, stream>>>(dist.data(), indices.data(), x, y, m, n, k, type, isRowMajor);
-  detail::select_k(
-    dist.data(), indices.data(), m, n, dist_topk, indices_topk, true, numOfNN, stream);
+    naiveDistanceKernel<DataType, AccT><<<grid_dim, block_dim, 0, stream>>>(
+      dist.data(), x + offset * dim, y, batch_size, input_len, dim, type);
+
+    select_k<int64_t, float>(dist.data(),
+                             nullptr,
+                             batch_size,
+                             input_len,
+                             dist_topk + offset * k,
+                             indices_topk + offset * k,
+                             true,
+                             static_cast<int>(k),
+                             stream,
+                             SelectKAlgo::WARP_SORT);
+  }
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 }
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+
+}  // namespace raft::spatial::knn
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 24ec18b527..5c3a01c75b 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -15,13 +15,12 @@
  */
 
 #include "../test_utils.h"
-
 #include "./ann_base_kernel.cuh"
+
+#include <raft/core/logger.hpp>
 #include <raft/distance/distance_type.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
-#include <raft/spatial/knn/detail/common_faiss.h>
-
 #include <raft/spatial/knn/knn.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -43,7 +42,7 @@ struct AnnIvfFlatInputs {
   int k;
   int nprobe;
   int nlist;
-  raft::distance::DistanceType metric_;
+  raft::distance::DistanceType metric;
 };
 
 template <typename IdxT, typename DistT, typename compareDist>
@@ -61,54 +60,37 @@ struct idx_dist_pair {
 };
 
 template <typename T, typename DistT>
-testing::AssertionResult devArrMatchKnnPair(const T* expected_idx,
-                                            const T* actual_idx,
-                                            const DistT* expected_dist,
-                                            const DistT* actual_dist,
-                                            size_t rows,
-                                            size_t cols,
-                                            const DistT eps,
-                                            double min_recall,
-                                            cudaStream_t stream = 0)
+auto eval_knn(const std::vector<T>& expected_idx,
+              const std::vector<T>& actual_idx,
+              const std::vector<DistT>& expected_dist,
+              const std::vector<DistT>& actual_dist,
+              size_t rows,
+              size_t cols,
+              const DistT eps,
+              double min_recall) -> testing::AssertionResult
 {
-  size_t size = rows * cols;
-  std::unique_ptr<T[]> exp_idx_h(new T[size]);
-  std::unique_ptr<T[]> act_idx_h(new T[size]);
-  std::unique_ptr<DistT[]> exp_dist_h(new DistT[size]);
-  std::unique_ptr<DistT[]> act_dist_h(new DistT[size]);
-  raft::update_host<T>(exp_idx_h.get(), expected_idx, size, stream);
-  raft::update_host<T>(act_idx_h.get(), actual_idx, size, stream);
-  raft::update_host<DistT>(exp_dist_h.get(), expected_dist, size, stream);
-  raft::update_host<DistT>(act_dist_h.get(), actual_dist, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   size_t match_count = 0;
+  size_t total_count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
   for (size_t i = 0; i < rows; ++i) {
     for (size_t k = 0; k < cols; ++k) {
       size_t idx_k  = i * cols + k;  // row major assumption!
-      auto act_idx  = act_idx_h.get()[idx_k];
-      auto act_dist = act_dist_h.get()[idx_k];
+      auto act_idx  = actual_idx[idx_k];
+      auto act_dist = actual_dist[idx_k];
       for (size_t j = 0; j < cols; ++j) {
         size_t idx    = i * cols + j;  // row major assumption!
-        auto exp_idx  = exp_idx_h.get()[idx];
-        auto exp_dist = exp_dist_h.get()[idx];
+        auto exp_idx  = expected_idx[idx];
+        auto exp_dist = expected_dist[idx];
         idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox<DistT>(eps));
         idx_dist_pair act_kvp(act_idx, act_dist, raft::CompareApprox<DistT>(eps));
-        if (!(exp_kvp == act_kvp)) {
-          // return testing::AssertionFailure()
-          //        << "actual=" << act_kvp.idx << "," << act_kvp.dist << "!="
-          //        << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i << "," << j;
-          // std::cout<< "actual = " << act_kvp.idx << "," << act_kvp.dist << " != "  <<
-          //           " expected = " << exp_kvp.idx << "," << exp_kvp.dist << " @" << i
-          //           << "," << j << std::endl;
-        } else {
+        if (exp_kvp == act_kvp) {
           match_count++;
           break;
         }
       }
     }
   }
-  std::cout << "Recall = " << match_count << "/" << rows * cols << std::endl;
-  double actual_recall = static_cast<double>(match_count) / static_cast<double>(rows * cols);
+  RAFT_LOG_INFO("Recall = %zu/%zu", match_count, total_count);
+  double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
   if (actual_recall < min_recall - eps) {
     return testing::AssertionFailure()
            << "actual recall (" << actual_recall
@@ -122,152 +104,116 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
  public:
   AnnIVFFlatTest()
     : stream_(handle_.get_stream()),
-      params_(::testing::TestWithParam<AnnIvfFlatInputs>::GetParam()),
-      database(params_.num_db_vecs * params_.dim, stream_),
-      search_queries(params_.num_queries * params_.dim, stream_),
-      raft_indices_(params_.num_queries * params_.k, stream_),
-      raft_distances_(params_.num_queries * params_.k, stream_),
-      faiss_indices_(params_.num_queries * params_.k, stream_),
-      faiss_distances_(params_.num_queries * params_.k, stream_)
+      ps(::testing::TestWithParam<AnnIvfFlatInputs>::GetParam()),
+      database(0, stream_),
+      search_queries(0, stream_)
   {
-    RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(DataT), stream_));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(DataT), stream_));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(int64_t), stream_));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(raft_distances_.data(), 0, raft_distances_.size() * sizeof(T), stream_));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
-    handle_.sync_stream(stream_);
   }
 
  protected:
   void testIVFFlat(bool is8bit)
   {
-    if constexpr (std::is_same<DataT, uint8_t>{}) {
-      naiveBfKnn<uint8_t, uint32_t>(faiss_distances_.data(),
-                                    faiss_indices_.data(),
-                                    search_queries.data(),
-                                    database.data(),
-                                    num_queries,
-                                    num_db_vecs,
-                                    dim,
-                                    k_,
-                                    metric,
-                                    true,
-                                    2.0f,
-                                    stream_);
-    } else if constexpr (std::is_same<DataT, int8_t>{}) {
-      naiveBfKnn<int8_t, int32_t>(faiss_distances_.data(),
-                                  faiss_indices_.data(),
-                                  search_queries.data(),
-                                  database.data(),
-                                  num_queries,
-                                  num_db_vecs,
-                                  dim,
-                                  k_,
-                                  metric,
-                                  true,
-                                  2.0f,
-                                  stream_);
-    } else if constexpr (std::is_same<DataT, float>{}) {
-      naiveBfKnn<float, float>(faiss_distances_.data(),
-                               faiss_indices_.data(),
+    size_t queries_size = ps.num_queries * ps.k;
+    std::vector<int64_t> indices_ivfflat(queries_size);
+    std::vector<int64_t> indices_naive(queries_size);
+    std::vector<T> distances_ivfflat(queries_size);
+    std::vector<T> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<T> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<int64_t> indices_naive_dev(queries_size, stream_);
+      using acc_t = typename detail::utils::config<DataT>::value_t;
+      naiveBfKnn<DataT, acc_t>(distances_naive_dev.data(),
+                               indices_naive_dev.data(),
                                search_queries.data(),
                                database.data(),
-                               num_queries,
-                               num_db_vecs,
-                               dim,
-                               k_,
-                               metric,
-                               true,
+                               ps.num_queries,
+                               ps.num_db_vecs,
+                               ps.dim,
+                               ps.k,
+                               ps.metric,
                                2.0f,
                                stream_);
+      update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      handle_.sync_stream(stream_);
     }
-    handle_.sync_stream(stream_);
 
-    raft::spatial::knn::IVFFlatParam ivfParams;
-    ivfParams.nprobe = nprobe_;
-    ivfParams.nlist  = nlist_;
-    raft::spatial::knn::knnIndex index;
-    index.index   = nullptr;
-    index.gpu_res = nullptr;
+    {
+      rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
+      rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
+      raft::spatial::knn::IVFFlatParam ivfParams;
+      ivfParams.nprobe = ps.nprobe;
+      ivfParams.nlist  = ps.nlist;
+      raft::spatial::knn::knnIndex index;
+      index.index   = nullptr;
+      index.gpu_res = nullptr;
 
-    approx_knn_build_index(handle_,
-                           &index,
-                           dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
-                           metric,
-                           0,
-                           database.data(),
-                           num_db_vecs,
-                           dim);
-    handle_.sync_stream(stream_);
-    approx_knn_search(handle_,
-                      raft_distances_.data(),
-                      raft_indices_.data(),
-                      &index,
-                      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
-                      k_,
-                      search_queries.data(),
-                      num_queries);
-    handle_.sync_stream(stream_);
+      approx_knn_build_index(handle_,
+                             &index,
+                             dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                             ps.metric,
+                             0,
+                             database.data(),
+                             ps.num_db_vecs,
+                             ps.dim);
+      handle_.sync_stream(stream_);
+
+      approx_knn_search(handle_,
+                        distances_ivfflat_dev.data(),
+                        indices_ivfflat_dev.data(),
+                        &index,
+                        dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                        ps.k,
+                        search_queries.data(),
+                        ps.num_queries);
+      update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
+      update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
+      handle_.sync_stream(stream_);
+    }
 
     // unless something is really wrong with clustering, this could serve as a lower bound on recall
-    double min_recall = static_cast<double>(nprobe_) / static_cast<double>(nlist_);
+    double min_recall = static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist);
     // verify.
-    devArrMatchKnnPair(faiss_indices_.data(),
-                       raft_indices_.data(),
-                       faiss_distances_.data(),
-                       raft_distances_.data(),
-                       num_queries,
-                       k_,
-                       float(0.001),
-                       min_recall,
-                       stream_);
+    eval_knn(indices_naive,
+             indices_ivfflat,
+             distances_naive,
+             distances_ivfflat,
+             ps.num_queries,
+             ps.k,
+             float(0.001),
+             min_recall);
   }
 
   void SetUp() override
   {
-    num_queries = params_.num_queries;
-    num_db_vecs = params_.num_db_vecs;
-    dim         = params_.dim;
-    k_          = params_.k;
-    metric      = params_.metric_;
-    nprobe_     = params_.nprobe;
-    nlist_      = params_.nlist;
+    database.resize(ps.num_db_vecs * ps.dim, stream_);
+    search_queries.resize(ps.num_queries * ps.dim, stream_);
 
-    unsigned long long int seed = 1234ULL;
-    raft::random::Rng r(seed);
+    raft::random::Rng r(1234ULL);
     if constexpr (std::is_same<DataT, float>{}) {
-      r.uniform(database.data(), num_db_vecs * dim, DataT(0.1), DataT(2.0), stream_);
-      r.uniform(search_queries.data(), num_queries * dim, DataT(0.1), DataT(2.0), stream_);
+      r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_);
+      r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_);
     } else {
-      r.uniformInt(database.data(), num_db_vecs * dim, DataT(1), DataT(20), stream_);
-      r.uniformInt(search_queries.data(), num_queries * dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_);
     }
     handle_.sync_stream(stream_);
   }
 
+  void TearDown() override
+  {
+    handle_.sync_stream(stream_);
+    database.resize(0, stream_);
+    search_queries.resize(0, stream_);
+  }
+
  private:
   raft::handle_t handle_;
   rmm::cuda_stream_view stream_;
-  AnnIvfFlatInputs params_;
-  int num_queries;
-  int num_db_vecs;
-  int dim;
+  AnnIvfFlatInputs ps;
   rmm::device_uvector<DataT> database;
   rmm::device_uvector<DataT> search_queries;
-  rmm::device_uvector<int64_t> raft_indices_;
-  rmm::device_uvector<T> raft_distances_;
-  rmm::device_uvector<int64_t> faiss_indices_;
-  rmm::device_uvector<T> faiss_distances_;
-  int k_;
-  int nprobe_;
-  int nlist_;
-  raft::distance::DistanceType metric;
 };
 
 const std::vector<AnnIvfFlatInputs> inputs = {

From 4042b2841ab87cfacab67fdbba2bac4388a3eb64 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 7 Jun 2022 14:51:56 +0200
Subject: [PATCH 058/118] Refactored and documents ann_kmeans_balanced

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 218 +++++-----
 .../knn/detail/ann_kmeans_balanced.cuh        | 405 ++++++++++--------
 2 files changed, 333 insertions(+), 290 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 0816a19307..1ff4d3dd84 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -234,44 +234,44 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
 
   auto mesoClusterSize = mesoClusterSize_buf.data();
 
-  size_t sizePredictWorkspace =
-    _cuann_kmeans_predict_bufferSize(numMesoClusters,  // number of centers
-                                     dimDataset,
-                                     numTrainset  // number of vectors
-    );
+  size_t sizePredictWorkspace = kmeans::predict_buffer_size(numMesoClusters,  // number of centers
+                                                            dimDataset,
+                                                            numTrainset  // number of vectors
+  );
   rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream_);
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
     RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, numIterations);
-    _cuann_kmeans_predict(handle_,
-                          mesoClusterCenters.data(),
-                          numMesoClusters,
-                          dimDataset,
-                          trainset,
-                          numTrainset,
-                          mesoClusterLabels.data(),
-                          metric_type_,
-                          (iter != 0),
-                          predictWorkspace.data(),
-                          mesoClusterCentersTemp.data(),
-                          mesoClusterSize,
-                          stream_);
-
-    if (iter < 2 * (numIterations - 2)) {
-      if (_cuann_kmeans_adjust_centers(mesoClusterCenters.data(),
-                                       numMesoClusters,
-                                       dimDataset,
-                                       trainset,
-                                       numTrainset,
-                                       mesoClusterLabels.data(),
-                                       metric_type_,
-                                       mesoClusterSize,
-                                       (float)1.0 / 4,
-                                       stream_)) {
+    kmeans::predict(handle_,
+                    mesoClusterCenters.data(),
+                    numMesoClusters,
+                    dimDataset,
+                    trainset,
+                    numTrainset,
+                    mesoClusterLabels.data(),
+                    metric_type_,
+                    (iter != 0),
+                    predictWorkspace.data(),
+                    mesoClusterCentersTemp.data(),
+                    mesoClusterSize,
+                    true,
+                    stream_);
+
+    if (iter + 1 < 2 * numIterations) {
+      if (kmeans::adjust_centers(mesoClusterCenters.data(),
+                                 numMesoClusters,
+                                 dimDataset,
+                                 trainset,
+                                 numTrainset,
+                                 mesoClusterLabels.data(),
+                                 metric_type_,
+                                 mesoClusterSize,
+                                 (float)1.0 / 4,
+                                 stream_)) {
         iter -= 1;
-      }  // end if _cuann_kmeans_adjust_centers
-    }    // end if iter < 2 * (numIterations - 2)
-  }      // end for (int iter = 0; iter < 2 * numIterations; iter += 2)
+      }
+    }
+  }
 
   handle_.sync_stream(stream_);
 
@@ -314,12 +314,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
 
   sizePredictWorkspace = 0;
   for (uint32_t i = 0; i < numMesoClusters; i++) {
-    sizePredictWorkspace =
-      max(sizePredictWorkspace,
-          _cuann_kmeans_predict_bufferSize(numFineClusters[i],  // number of centers
-                                           dimDataset,
-                                           mesoClusterSize[i]  // number of vectors
-                                           ));
+    sizePredictWorkspace = max(sizePredictWorkspace,
+                               kmeans::predict_buffer_size(numFineClusters[i],  // number of centers
+                                                           dimDataset,
+                                                           mesoClusterSize[i]  // number of vectors
+                                                           ));
   }
 
   // label (cluster ID) of each vector
@@ -360,31 +359,32 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                      (float)iter / 2,
                      numIterations);
 
-      _cuann_kmeans_predict(handle_,
-                            clusterCentersEach.data(),
-                            numFineClusters[i],
-                            dimDataset,
-                            subTrainset,
-                            mesoClusterSize[i],
-                            labelsMP.data(),
-                            metric_type_,
-                            (iter != 0),
-                            predictWorkspace.data(),
-                            clusterCentersMP.data(),
-                            clusterSizeMP.data(),
-                            stream_);
-
-      if (iter < 2 * (numIterations - 2)) {
-        if (_cuann_kmeans_adjust_centers(clusterCentersEach.data(),
-                                         numFineClusters[i],
-                                         dimDataset,
-                                         subTrainset,
-                                         mesoClusterSize[i],
-                                         labelsMP.data(),
-                                         metric_type_,
-                                         clusterSizeMP.data(),
-                                         (float)1.0 / 4,
-                                         stream_)) {
+      kmeans::predict(handle_,
+                      clusterCentersEach.data(),
+                      numFineClusters[i],
+                      dimDataset,
+                      subTrainset,
+                      mesoClusterSize[i],
+                      labelsMP.data(),
+                      metric_type_,
+                      (iter != 0),
+                      predictWorkspace.data(),
+                      clusterCentersMP.data(),
+                      clusterSizeMP.data(),
+                      true,
+                      stream_);
+
+      if (iter + 1 < 2 * numIterations) {
+        if (kmeans::adjust_centers(clusterCentersEach.data(),
+                                   numFineClusters[i],
+                                   dimDataset,
+                                   subTrainset,
+                                   mesoClusterSize[i],
+                                   labelsMP.data(),
+                                   metric_type_,
+                                   clusterSizeMP.data(),
+                                   (float)1.0 / 4,
+                                   stream_)) {
           iter -= 1;
         }
       }
@@ -402,61 +402,61 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   clusterSizeMP.resize(numClusters, stream_);
 
   // [...]
-  sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, numTrainset);
+  sizePredictWorkspace = kmeans::predict_buffer_size(numClusters, dimDataset, numTrainset);
   predictWorkspace.resize(sizePredictWorkspace, stream_);
 
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
-    _cuann_kmeans_predict(handle_,
-                          clusterCenters,
-                          numClusters,
-                          dimDataset,
-                          trainset,
-                          numTrainset,
-                          trainsetLabels.data(),
-                          metric_type_,
-                          true,
-                          predictWorkspace.data(),
-                          clusterCentersMP.data(),
-                          clusterSizeMP.data(),
-                          true,
-                          stream_);
+    kmeans::predict(handle_,
+                    clusterCenters,
+                    numClusters,
+                    dimDataset,
+                    trainset,
+                    numTrainset,
+                    trainsetLabels.data(),
+                    metric_type_,
+                    true,
+                    predictWorkspace.data(),
+                    clusterCentersMP.data(),
+                    clusterSizeMP.data(),
+                    true,
+                    stream_);
   }  // end for (int iter = 0; iter < 2; iter++)
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
-  sizePredictWorkspace = _cuann_kmeans_predict_bufferSize(numClusters, dimDataset, nrow_);
+  sizePredictWorkspace = kmeans::predict_buffer_size(numClusters, dimDataset, nrow_);
   predictWorkspace.resize(sizePredictWorkspace, stream_);
 
-  _cuann_kmeans_predict(handle_,
-                        clusterCenters,
-                        nlist_,
-                        dim_,
-                        dataset,
-                        nrow_,
-                        datasetLabels,
-                        metric_type_,
-                        true,
-                        predictWorkspace.data(),
-                        clusterCentersMP.data(),
-                        clusterSizeMP.data(),
-                        true,
-                        stream_);
-
-  _cuann_kmeans_predict(handle_,
-                        clusterCenters,
-                        nlist_,
-                        dim_,
-                        dataset,
-                        nrow_,
-                        datasetLabels,
-                        metric_type_,
-                        true,
-                        predictWorkspace.data(),
-                        clusterCentersMP.data(),
-                        clusterSizeMP.data(),
-                        false,
-                        stream_);
+  kmeans::predict(handle_,
+                  clusterCenters,
+                  nlist_,
+                  dim_,
+                  dataset,
+                  nrow_,
+                  datasetLabels,
+                  metric_type_,
+                  true,
+                  predictWorkspace.data(),
+                  clusterCentersMP.data(),
+                  clusterSizeMP.data(),
+                  true,
+                  stream_);
+
+  kmeans::predict(handle_,
+                  clusterCenters,
+                  nlist_,
+                  dim_,
+                  dataset,
+                  nrow_,
+                  datasetLabels,
+                  metric_type_,
+                  true,
+                  predictWorkspace.data(),
+                  clusterCentersMP.data(),
+                  clusterSizeMP.data(),
+                  false,
+                  stream_);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 049940ab85..2a552c3a25 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -31,28 +31,22 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-namespace raft::spatial::knn::detail {
+namespace raft::spatial::knn::detail::kmeans {
 
-// predict label of dataset
-void _cuann_kmeans_predict_core(const handle_t& handle,
-                                const float* centers,  // [numCenters, dimCenters]
-                                uint32_t numCenters,
-                                uint32_t dimCenters,
-                                const float* dataset,  // [numDataset, dimCenters]
-                                uint32_t numDataset,
-                                uint32_t* labels,  // [numDataset]
-                                raft::distance::DistanceType metric,
-                                float* workspace,
-                                rmm::cuda_stream_view stream)
+void predict_core_(const handle_t& handle,
+                   const float* centers,  // [n_clusters, dim]
+                   uint32_t n_clusters,
+                   uint32_t dim,
+                   const float* dataset,  // [n_rows, dim]
+                   uint32_t n_rows,
+                   uint32_t* labels,  // [n_rows]
+                   raft::distance::DistanceType metric,
+                   float* workspace,
+                   rmm::cuda_stream_view stream)
 {
-  const uint32_t dimDataset = dimCenters;
-  float* sqsumCenters;  // [numCenters]
-  float* sqsumDataset;  // [numDataset]
-  float* distances;     // [numDataset, numCenters]
-
-  sqsumCenters = workspace;
-  sqsumDataset = sqsumCenters + numCenters;
-  distances    = sqsumDataset + numDataset;
+  float* sqsum_centers = workspace;                   // [n_clusters]
+  float* sqsum_data    = sqsum_centers + n_clusters;  // [n_rows]
+  float* distances     = sqsum_data + n_rows;         // [n_rows, n_clusters]
 
   float alpha;
   float beta;
@@ -60,290 +54,339 @@ void _cuann_kmeans_predict_core(const handle_t& handle,
     alpha = -1.0;
     beta  = 0.0;
   } else {
-    utils::dots_along_rows(numCenters, dimCenters, centers, sqsumCenters, stream);
-    utils::dots_along_rows(numDataset, dimDataset, dataset, sqsumDataset, stream);
-    utils::outer_add(sqsumDataset, numDataset, sqsumCenters, numCenters, distances, stream);
+    utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers, stream);
+    utils::dots_along_rows(n_rows, dim, dataset, sqsum_data, stream);
+    utils::outer_add(sqsum_data, n_rows, sqsum_centers, n_clusters, distances, stream);
     alpha = -2.0;
     beta  = 1.0;
   }
   linalg::gemm(handle,
                true,
                false,
-               numCenters,
-               numDataset,
-               dimCenters,
+               n_clusters,
+               n_rows,
+               dim,
                &alpha,
                centers,
-               dimCenters,
+               dim,
                dataset,
-               dimDataset,
+               dim,
                &beta,
                distances,
-               numCenters,
+               n_clusters,
                stream);
-  utils::argmin_along_rows(numDataset, numCenters, distances, labels, stream);
+  utils::argmin_along_rows(n_rows, n_clusters, distances, labels, stream);
 }
 
-//
-uint32_t _cuann_kmeans_predict_chunkSize(uint32_t numCenters, uint32_t numDataset)
+auto predict_chunk_size_(uint32_t n_clusters, uint32_t n_rows) -> uint32_t
 {
-  numCenters     = max(1, numCenters);
+  n_clusters     = max(1, n_clusters);
   uint32_t chunk = (1 << 20);
-  if (chunk > (1 << 28) / numCenters) {
-    chunk = (1 << 28) / numCenters;
+  if (chunk > (1 << 28) / n_clusters) {
+    chunk = (1 << 28) / n_clusters;
     chunk += 32;
     chunk -= chunk % 64;
   }
-  chunk = min(chunk, numDataset);
+  chunk = min(chunk, n_rows);
   return chunk;
 }
 
-//
-size_t _cuann_kmeans_predict_bufferSize(uint32_t numCenters,
-                                        uint32_t dimCenters,
-                                        uint32_t numDataset)
+/**
+ * @brief Calculate the required workspace size for the `predict`.
+ */
+auto predict_buffer_size(uint32_t n_clusters, uint32_t dim, uint32_t n_rows) -> size_t
 {
-  uint32_t chunk = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
+  uint32_t chunk = predict_chunk_size_(n_clusters, n_rows);
   size_t size    = 0;
   using align_t  = Pow2<128>;
-  // float *curDataset;  // [chunk, dimCenters]
-  size += align_t::roundUp(sizeof(float) * chunk * dimCenters);
-  // void *bufDataset;  // [chunk, dimCenters]
-  size += align_t::roundUp(sizeof(float) * chunk * dimCenters);
+  // float *cur_dataset;  // [chunk, dim]
+  size += align_t::roundUp(sizeof(float) * chunk * dim);
+  // void *buf_dataset;  // [chunk, dim]
+  size += align_t::roundUp(sizeof(float) * chunk * dim);
   // float *workspace;
-  size += align_t::roundUp(sizeof(float) * (numCenters + chunk + (numCenters * chunk)));
+  size += align_t::roundUp(sizeof(float) * (n_clusters + chunk + (n_clusters * chunk)));
   return size;
 }
 
 /**
  * @brief update kmeans centers
  *
- * NB: `centers` and `clusterSize` must be accessible on GPU due to
- * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointer are
+ * Let S_i = {x_k | x_k \in dataset & labels[k] == i} be the vectors in the dataset with label i.
+ *   On exit centers_i = normalize(\sum_{x \in S_i} x), where `normalize` depends on the distance
+ * type.
+ *
+ * If accumulated_centers is not null, then it is expected that the summation is already done and
+ * the results are stored in accumulated_centers. In that case only the normalization will be
+ * applied.
+ *
+ * NB: `centers` and `cluster_sizes` must be accessible on GPU due to
+ * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are
  * accessible from the same place.
  *
  * i.e. two variants are possible:
  *
  *   1. All pointers are on the device.
- *   2. All pointers are on the host, but `centers` and `clusterSize` are accessible from GPU.
+ *   2. All pointers are on the host, but `centers` and `cluster_sizes` are accessible from GPU.
+ *
+ * @tparam T element type
  *
+ * @param[out] centers pointer to the output [n_clusters, dim]
+ * @param n_clusters number of clusters/centers
+ * @param dim dimensionality of the data
+ * @param[in] dataset a pointer to the data [n_rows, dim]
+ * @param n_rows number samples in the `dataset`
+ * @param[in] labels output predictions [n_rows]
+ * @param metric
+ * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
+ * @param[in] accumulated_centers (optional) pre-computed accumulated sums
+ *                                (non-normalized centers) [n_clusters, dim]
+ * @param stream
  */
 template <typename T>
-void _cuann_kmeans_update_centers(float* centers,  // [numCenters, dimCenters]
-                                  uint32_t numCenters,
-                                  uint32_t dimCenters,
-                                  const T* dataset,  // [numDataset, dimCenters]
-                                  uint32_t numDataset,
-                                  uint32_t* labels,  // [numDataset]
-                                  raft::distance::DistanceType metric,
-                                  uint32_t* clusterSize,  // [numCenters]
-                                  float* accumulatedCenters    = nullptr,
-                                  rmm::cuda_stream_view stream = rmm::cuda_stream_default)
+void update_centers(float* centers,
+                    uint32_t n_clusters,
+                    uint32_t dim,
+                    const T* dataset,
+                    uint32_t n_rows,
+                    const uint32_t* labels,
+                    raft::distance::DistanceType metric,
+                    uint32_t* cluster_sizes,
+                    const float* accumulated_centers,
+                    rmm::cuda_stream_view stream)
 {
-  if (accumulatedCenters == nullptr) {
+  if (accumulated_centers == nullptr) {
     // accumulate
-    utils::memset(centers, 0, sizeof(float) * numCenters * dimCenters, stream);
-    utils::memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
+    utils::memset(centers, 0, sizeof(float) * n_clusters * dim, stream);
+    utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
     utils::accumulate_into_selected<T>(
-      numDataset, dimCenters, centers, clusterSize, dataset, labels, stream);
+      n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
   } else {
-    copy(centers, accumulatedCenters, numCenters * dimCenters, stream);
+    copy(centers, accumulated_centers, n_clusters * dim, stream);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
     // normalize
-    utils::normalize_rows(numCenters, dimCenters, centers, stream);
+    utils::normalize_rows(n_clusters, dim, centers, stream);
   } else {
     // average
-    utils::divide_along_rows(numCenters, dimCenters, centers, clusterSize, stream);
+    utils::divide_along_rows(n_clusters, dim, centers, cluster_sizes, stream);
   }
 }
 
 /**
- * @brief predict label of dataset
+ * @brief Predict labels for the dataset. For each point we assign the label of the nearest center.
  *
  * NB: seems that all pointers here are accessed by devicie code only
  *
+ * @tparam T element type
+ *
+ * @param handle
+ * @param[inout] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
+ * @param n_clusters number of clusters/centers
+ * @param dim dimensionality of the data
+ * @param[in] dataset a pointer to the data [n_rows, dim]
+ * @param n_rows number samples in the `dataset`
+ * @param[out] labels output predictions [n_rows]
+ * @param metric
+ * @param is_center_set
+ * @param[in] _workspace optional
+ * @param[in] centers_temp optional [n_clusters, dim]
+ * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
+ * @param shall_update_centers
+ * @param stream
  */
 template <typename T>
-void _cuann_kmeans_predict(const handle_t& handle,
-                           float* centers,  // [numCenters, dimCenters]
-                           uint32_t numCenters,
-                           uint32_t dimCenters,
-                           const T* dataset,  // [numDataset, dimCenters]
-                           uint32_t numDataset,
-                           uint32_t* labels,  // [numDataset]
-                           raft::distance::DistanceType metric,
-                           bool isCenterSet             = true,
-                           void* _workspace             = nullptr,
-                           float* tempCenters           = nullptr,  // [numCenters, dimCenters]
-                           uint32_t* clusterSize        = nullptr,  // [numCenters,]
-                           bool updateCenter            = true,
-                           rmm::cuda_stream_view stream = rmm::cuda_stream_default)
+void predict(const handle_t& handle,
+             float* centers,
+             uint32_t n_clusters,
+             uint32_t dim,
+             const T* dataset,
+             uint32_t n_rows,
+             uint32_t* labels,
+             raft::distance::DistanceType metric,
+             bool is_center_set,
+             void* _workspace,
+             float* centers_temp,
+             uint32_t* cluster_sizes,
+             bool shall_update_centers,
+             rmm::cuda_stream_view stream)
 {
-  if (numDataset == 0) {
-    RAFT_LOG_WARN("cuann_kmeans_predict: empty dataset (numDataset = %d, numCenters = %d)",
-                  numDataset,
-                  numCenters);
+  if (n_rows == 0) {
+    RAFT_LOG_WARN(
+      "cuann_kmeans_predict: empty dataset (n_rows = %d, n_clusters = %d)", n_rows, n_clusters);
     return;
   }
-  if (!isCenterSet) {
+  if (!is_center_set) {
     // If centers are not set, the labels will be determined randomly.
     linalg::writeOnlyUnaryOp(
       labels,
-      numDataset,
-      [numCenters] __device__(uint32_t * out, uint32_t i) { *out = i % numCenters; },
+      n_rows,
+      [n_clusters] __device__(uint32_t * out, uint32_t i) { *out = i % n_clusters; },
       stream);
-    if (tempCenters != nullptr && clusterSize != nullptr) {
+    if (centers_temp != nullptr && cluster_sizes != nullptr) {
       // update centers
-      _cuann_kmeans_update_centers(centers,
-                                   numCenters,
-                                   dimCenters,
-                                   dataset,
-                                   numDataset,
-                                   labels,
-                                   metric,
-                                   clusterSize,
-                                   nullptr,
-                                   stream);
+      update_centers(
+        centers, n_clusters, dim, dataset, n_rows, labels, metric, cluster_sizes, nullptr, stream);
     }
     return;
   }
 
-  uint32_t chunk  = _cuann_kmeans_predict_chunkSize(numCenters, numDataset);
-  void* workspace = _workspace;
+  uint32_t chunk_max = predict_chunk_size_(n_clusters, n_rows);
+  void* workspace    = _workspace;
   rmm::device_buffer sub_workspace(0, stream);
 
   if (_workspace == nullptr) {
-    sub_workspace.resize(_cuann_kmeans_predict_bufferSize(numCenters, dimCenters, numDataset),
-                         stream);
+    sub_workspace.resize(predict_buffer_size(n_clusters, dim, n_rows), stream);
     workspace = sub_workspace.data();
   }
-  float* curDataset;  // [chunk, dimCenters]
-  T* bufDataset;      // [chunk, dimCenters]
-  float* workspace_core;
-  curDataset = (float*)workspace;
-  bufDataset = (T*)((uint8_t*)curDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters));
-  workspace_core =
-    (float*)((uint8_t*)bufDataset + Pow2<128>::roundUp(sizeof(float) * chunk * dimCenters));
 
-  if (tempCenters != nullptr && clusterSize != nullptr) {
-    utils::memset(tempCenters, 0, sizeof(float) * numCenters * dimCenters, stream);
-    utils::memset(clusterSize, 0, sizeof(uint32_t) * numCenters, stream);
+  // [chunk_max, dim]
+  auto cur_dataset = reinterpret_cast<float*>(workspace);
+  // [chunk_max, dim]
+  auto buf_dataset    = reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(cur_dataset) +
+                                          Pow2<128>::roundUp(sizeof(float) * chunk_max * dim));
+  auto workspace_core = reinterpret_cast<float*>(
+    reinterpret_cast<uint8_t*>(buf_dataset) + Pow2<128>::roundUp(sizeof(float) * chunk_max * dim));
+
+  if (centers_temp != nullptr && cluster_sizes != nullptr) {
+    utils::memset(centers_temp, 0, sizeof(float) * n_clusters * dim, stream);
+    utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
   }
 
-  for (uint64_t is = 0; is < numDataset; is += chunk) {
-    uint64_t ie       = min(is + chunk, (uint64_t)numDataset);
-    uint32_t nDataset = ie - is;
+  for (uint32_t offset = 0; offset < n_rows; offset += chunk_max) {
+    auto chunk = std::min<uint32_t>(chunk_max, n_rows - offset);
 
-    copy(bufDataset, dataset + is * dimCenters, nDataset * dimCenters, stream);
+    copy(buf_dataset, dataset + offset * dim, chunk * dim, stream);
     handle.sync_stream(stream);
 
     if constexpr (std::is_same_v<T, float>) {
       // No need to copy floats
-      curDataset = bufDataset;
+      cur_dataset = buf_dataset;
     } else {
-      linalg::unaryOp(
-        curDataset, bufDataset, nDataset * dimCenters, utils::mapping<float>{}, stream);
+      linalg::unaryOp(cur_dataset, buf_dataset, chunk * dim, utils::mapping<float>{}, stream);
     }
 
     // predict
-    _cuann_kmeans_predict_core(handle,
-                               centers,
-                               numCenters,
-                               dimCenters,
-                               curDataset,
-                               nDataset,
-                               labels + is,
-                               metric,
-                               workspace_core,
-                               stream);
+    predict_core_(handle,
+                  centers,
+                  n_clusters,
+                  dim,
+                  cur_dataset,
+                  chunk,
+                  labels + offset,
+                  metric,
+                  workspace_core,
+                  stream);
 
-    if ((tempCenters != nullptr) && (clusterSize != nullptr)) {
+    if ((centers_temp != nullptr) && (cluster_sizes != nullptr)) {
       // accumulate
       utils::accumulate_into_selected<float>(
-        nDataset, dimCenters, tempCenters, clusterSize, curDataset, labels + is, stream);
+        chunk, dim, centers_temp, cluster_sizes, cur_dataset, labels + offset, stream);
     }
   }
 
-  if ((tempCenters != nullptr) && (clusterSize != nullptr) && updateCenter) {
-    _cuann_kmeans_update_centers(centers,
-                                 numCenters,
-                                 dimCenters,
-                                 dataset,
-                                 numDataset,
-                                 labels,
-                                 metric,
-                                 clusterSize,
-                                 tempCenters,
-                                 stream);
+  if ((centers_temp != nullptr) && (cluster_sizes != nullptr) && shall_update_centers) {
+    update_centers(centers,
+                   n_clusters,
+                   dim,
+                   dataset,
+                   n_rows,
+                   labels,
+                   metric,
+                   cluster_sizes,
+                   centers_temp,
+                   stream);
   }
 }
 
 /**
- * @brief adjust centers which have small number of entries
+ * @brief Adjust centers which have small number of entries.
+ *
+ * For each cluster, where the cluster size is not bigger than a threshold, the center is moved
+ * towards a data point that belongs to a large cluster.
+ *
+ * NB: if this function returns `true`, you should update the labels.
  *
  * NB: all pointers are used on the host side.
+ *
+ * @tparam T element type
+ *
+ * @param[inout] centers cluster centers [n_clusters, dim]
+ * @param n_clusters number of rows in `centers`
+ * @param dim number of columns in `centers` and `dataset`
+ * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim]
+ * @param n_rows number of rows in `dataset`
+ * @param[in] labels a host pointer to the cluster indices [n_rows]
+ * @param metric
+ * @param[in] cluster_sizes number of rows in each cluster [n_clusters]
+ * @param threshold defines a criterion for adjusting a cluster
+ *                   (cluster_sizes <= average_size * threshold)
+ *                   0 <= threshold < 1
+ * @param stream
+ *
+ * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated).
  */
 template <typename T>
-bool _cuann_kmeans_adjust_centers(float* centers,  // [numCenters, dimCenters]
-                                  uint32_t numCenters,
-                                  uint32_t dimCenters,
-                                  const T* dataset,  // [numDataset, dimCenters]
-                                  uint32_t numDataset,
-                                  const uint32_t* labels,  // [numDataset]
-                                  raft::distance::DistanceType metric,
-                                  const uint32_t* clusterSize,  // [numCenters]
-                                  float threshold,
-                                  rmm::cuda_stream_view stream)
+auto adjust_centers(float* centers,
+                    size_t n_clusters,
+                    size_t dim,
+                    const T* dataset,
+                    size_t n_rows,
+                    const uint32_t* labels,
+                    raft::distance::DistanceType metric,
+                    const uint32_t* cluster_sizes,
+                    float threshold,
+                    rmm::cuda_stream_view stream) -> bool
 {
   stream.synchronize();
-  if (numCenters == 0) { return false; }
-  bool adjusted                = false;
-  static uint32_t i            = 0;
-  static uint32_t iPrimes      = 0;
-  constexpr uint32_t numPrimes = 40;
-  uint32_t primes[numPrimes]   = {29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
-                                601,  659,  733,  809,  863,  941,  1013, 1069, 1151, 1223,
-                                1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987,
-                                2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
-  uint32_t average             = numDataset / numCenters;
+  if (n_clusters == 0) { return false; }
+  constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
+                                      601,  659,  733,  809,  863,  941,  1013, 1069, 1151, 1223,
+                                      1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987,
+                                      2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
+  static size_t i        = 0;
+  static size_t i_primes = 0;
+
+  bool adjusted    = false;
+  uint32_t average = n_rows / n_clusters;
   uint32_t ofst;
 
   do {
-    iPrimes = (iPrimes + 1) % numPrimes;
-    ofst    = primes[iPrimes];
-  } while (numDataset % ofst == 0);
-  uint32_t count = 0;
+    i_primes = (i_primes + 1) % kPrimes.size();
+    ofst     = kPrimes[i_primes];
+  } while (n_rows % ofst == 0);
 
-  for (uint32_t l = 0; l < numCenters; l++) {
-    if (clusterSize[l] > (uint32_t)(average * threshold)) continue;
+  for (size_t l = 0; l < n_clusters; l++) {
+    // skip big clusters
+    if (cluster_sizes[l] > static_cast<uint32_t>(average * threshold)) continue;
+    // choose a "random" i that belongs to a rather large cluster
     do {
-      i = (i + ofst) % numDataset;
-    } while (clusterSize[labels[i]] < average);
-    uint32_t li = labels[i];
+      i = (i + ofst) % n_rows;
+    } while (cluster_sizes[labels[i]] < average);
+    // Adjust the center of the selected smaller cluster to gravitate towards
+    // a sample from the selected larger cluster.
+    size_t li   = labels[i];
     float sqsum = 0.0;
-    for (uint32_t j = 0; j < dimCenters; j++) {
+    for (size_t j = 0; j < dim; j++) {
       constexpr float kWc = 7.0;
       constexpr float kWd = 1.0;
       float val           = 0;
-      val += kWc * centers[j + ((uint64_t)dimCenters * li)];
-      val += kWd * dataset[j + ((uint64_t)dimCenters * i)] / utils::config<T>::kDivisor;
+      val += kWc * centers[j + dim * li];
+      val += kWd * utils::mapping<float>{}(dataset[j + dim * i]);
       val /= kWc + kWd;
       sqsum += val * val;
-      centers[j + ((uint64_t)dimCenters * l)] = val;
+      centers[j + dim * l] = val;
     }
     if (metric == raft::distance::DistanceType::InnerProduct) {
       sqsum = sqrt(sqsum);
-      for (uint32_t j = 0; j < dimCenters; j++) {
-        centers[j + ((uint64_t)dimCenters * l)] /= sqsum;
+      for (size_t j = 0; j < dim; j++) {
+        centers[j + dim * l] /= sqsum;
       }
     }
     adjusted = true;
-    count += 1;
   }
   stream.synchronize();
   return adjusted;
 }
 
-}  // namespace raft::spatial::knn::detail
+}  // namespace raft::spatial::knn::detail::kmeans

From bb5726b4e9f99d53a008e7c34e0e5df8c46baf94 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 7 Jun 2022 16:25:09 +0200
Subject: [PATCH 059/118] Use memory_resource for temp data in kmeans

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  48 ++-----
 .../knn/detail/ann_kmeans_balanced.cuh        | 129 +++++++++---------
 2 files changed, 77 insertions(+), 100 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 1ff4d3dd84..1110326027 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -234,11 +234,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
 
   auto mesoClusterSize = mesoClusterSize_buf.data();
 
-  size_t sizePredictWorkspace = kmeans::predict_buffer_size(numMesoClusters,  // number of centers
-                                                            dimDataset,
-                                                            numTrainset  // number of vectors
-  );
-  rmm::device_buffer predictWorkspace(sizePredictWorkspace, stream_);
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> kmeans_mem_res(
+    rmm::mr::get_current_device_resource(),
+    // an arbitrary guess on the upper bound of the workspace size
+    Pow2<256>::roundUp(kmeans::calc_minibatch_size(numMesoClusters, nrow) * dimDataset * 4));
+
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
     RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, numIterations);
@@ -251,11 +251,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                     mesoClusterLabels.data(),
                     metric_type_,
                     (iter != 0),
-                    predictWorkspace.data(),
                     mesoClusterCentersTemp.data(),
                     mesoClusterSize,
                     true,
-                    stream_);
+                    stream_,
+                    &kmeans_mem_res);
 
     if (iter + 1 < 2 * numIterations) {
       if (kmeans::adjust_centers(mesoClusterCenters.data(),
@@ -312,20 +312,9 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   auto idsTrainset = idsTrainset_buf.data();
   auto subTrainset = subTrainset_buf.data();
 
-  sizePredictWorkspace = 0;
-  for (uint32_t i = 0; i < numMesoClusters; i++) {
-    sizePredictWorkspace = max(sizePredictWorkspace,
-                               kmeans::predict_buffer_size(numFineClusters[i],  // number of centers
-                                                           dimDataset,
-                                                           mesoClusterSize[i]  // number of vectors
-                                                           ));
-  }
-
   // label (cluster ID) of each vector
   rmm::device_uvector<uint32_t> labelsMP(mesoClusterSizeMax, stream_, &managed_memory);
 
-  predictWorkspace.resize(sizePredictWorkspace, stream_);
-
   rmm::device_uvector<float> clusterCentersEach(
     numFineClustersMax * dimDataset, stream_, &managed_memory);
   rmm::device_uvector<float> clusterCentersMP(
@@ -368,11 +357,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                       labelsMP.data(),
                       metric_type_,
                       (iter != 0),
-                      predictWorkspace.data(),
                       clusterCentersMP.data(),
                       clusterSizeMP.data(),
                       true,
-                      stream_);
+                      stream_,
+                      &kmeans_mem_res);
 
       if (iter + 1 < 2 * numIterations) {
         if (kmeans::adjust_centers(clusterCentersEach.data(),
@@ -401,10 +390,6 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   clusterCentersMP.resize(numClusters * dimDataset, stream_);
   clusterSizeMP.resize(numClusters, stream_);
 
-  // [...]
-  sizePredictWorkspace = kmeans::predict_buffer_size(numClusters, dimDataset, numTrainset);
-  predictWorkspace.resize(sizePredictWorkspace, stream_);
-
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
     kmeans::predict(handle_,
@@ -416,18 +401,15 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                     trainsetLabels.data(),
                     metric_type_,
                     true,
-                    predictWorkspace.data(),
                     clusterCentersMP.data(),
                     clusterSizeMP.data(),
                     true,
-                    stream_);
+                    stream_,
+                    &kmeans_mem_res);
   }  // end for (int iter = 0; iter < 2; iter++)
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
-  sizePredictWorkspace = kmeans::predict_buffer_size(numClusters, dimDataset, nrow_);
-  predictWorkspace.resize(sizePredictWorkspace, stream_);
-
   kmeans::predict(handle_,
                   clusterCenters,
                   nlist_,
@@ -437,11 +419,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                   datasetLabels,
                   metric_type_,
                   true,
-                  predictWorkspace.data(),
                   clusterCentersMP.data(),
                   clusterSizeMP.data(),
                   true,
-                  stream_);
+                  stream_,
+                  &kmeans_mem_res);
 
   kmeans::predict(handle_,
                   clusterCenters,
@@ -452,11 +434,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
                   datasetLabels,
                   metric_type_,
                   true,
-                  predictWorkspace.data(),
                   clusterCentersMP.data(),
                   clusterSizeMP.data(),
                   false,
-                  stream_);
+                  stream_,
+                  &kmeans_mem_res);
 
   return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }  // end func cuivflBuildOptimizedKmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 2a552c3a25..ae3dc98d8c 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -29,7 +29,11 @@
 #include <raft/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
+#include <rmm/device_vector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <optional>
 
 namespace raft::spatial::knn::detail::kmeans {
 
@@ -41,12 +45,10 @@ void predict_core_(const handle_t& handle,
                    uint32_t n_rows,
                    uint32_t* labels,  // [n_rows]
                    raft::distance::DistanceType metric,
-                   float* workspace,
+                   rmm::mr::device_memory_resource* mr,
                    rmm::cuda_stream_view stream)
 {
-  float* sqsum_centers = workspace;                   // [n_clusters]
-  float* sqsum_data    = sqsum_centers + n_clusters;  // [n_rows]
-  float* distances     = sqsum_data + n_rows;         // [n_rows, n_clusters]
+  rmm::device_uvector<float> distances(n_rows * n_clusters, stream, mr);
 
   float alpha;
   float beta;
@@ -54,9 +56,12 @@ void predict_core_(const handle_t& handle,
     alpha = -1.0;
     beta  = 0.0;
   } else {
-    utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers, stream);
-    utils::dots_along_rows(n_rows, dim, dataset, sqsum_data, stream);
-    utils::outer_add(sqsum_data, n_rows, sqsum_centers, n_clusters, distances, stream);
+    rmm::device_uvector<float> sqsum_centers(n_clusters, stream, mr);
+    rmm::device_uvector<float> sqsum_data(n_rows, stream, mr);
+    utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream);
+    utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream);
+    utils::outer_add(
+      sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream);
     alpha = -2.0;
     beta  = 1.0;
   }
@@ -72,40 +77,33 @@ void predict_core_(const handle_t& handle,
                dataset,
                dim,
                &beta,
-               distances,
+               distances.data(),
                n_clusters,
                stream);
-  utils::argmin_along_rows(n_rows, n_clusters, distances, labels, stream);
-}
-
-auto predict_chunk_size_(uint32_t n_clusters, uint32_t n_rows) -> uint32_t
-{
-  n_clusters     = max(1, n_clusters);
-  uint32_t chunk = (1 << 20);
-  if (chunk > (1 << 28) / n_clusters) {
-    chunk = (1 << 28) / n_clusters;
-    chunk += 32;
-    chunk -= chunk % 64;
-  }
-  chunk = min(chunk, n_rows);
-  return chunk;
+  utils::argmin_along_rows(n_rows, n_clusters, distances.data(), labels, stream);
 }
 
 /**
- * @brief Calculate the required workspace size for the `predict`.
+ * @brief Suggest a minibatch size for kmeans prediction.
+ *
+ * This function is used as a heuristic to split the work over a large dataset
+ * to reduce the size of temporary memory allocations.
+ *
+ * @param n_clusters number of clusters in kmeans clustering
+ * @param n_rows dataset size
+ * @return a suggested minibatch size
  */
-auto predict_buffer_size(uint32_t n_clusters, uint32_t dim, uint32_t n_rows) -> size_t
+constexpr auto calc_minibatch_size(uint32_t n_clusters, uint32_t n_rows) -> uint32_t
 {
-  uint32_t chunk = predict_chunk_size_(n_clusters, n_rows);
-  size_t size    = 0;
-  using align_t  = Pow2<128>;
-  // float *cur_dataset;  // [chunk, dim]
-  size += align_t::roundUp(sizeof(float) * chunk * dim);
-  // void *buf_dataset;  // [chunk, dim]
-  size += align_t::roundUp(sizeof(float) * chunk * dim);
-  // float *workspace;
-  size += align_t::roundUp(sizeof(float) * (n_clusters + chunk + (n_clusters * chunk)));
-  return size;
+  n_clusters              = std::max<uint32_t>(1, n_clusters);
+  uint32_t minibatch_size = (1 << 20);
+  if (minibatch_size > (1 << 28) / n_clusters) {
+    minibatch_size = (1 << 28) / n_clusters;
+    minibatch_size += 32;
+    minibatch_size -= minibatch_size % 64;
+  }
+  minibatch_size = std::min<uint32_t>(minibatch_size, n_rows);
+  return minibatch_size;
 }
 
 /**
@@ -189,11 +187,11 @@ void update_centers(float* centers,
  * @param[out] labels output predictions [n_rows]
  * @param metric
  * @param is_center_set
- * @param[in] _workspace optional
  * @param[in] centers_temp optional [n_clusters, dim]
  * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
  * @param shall_update_centers
  * @param stream
+ * @param mr (optional) memory resource to use for temporary allocations
  */
 template <typename T>
 void predict(const handle_t& handle,
@@ -205,11 +203,11 @@ void predict(const handle_t& handle,
              uint32_t* labels,
              raft::distance::DistanceType metric,
              bool is_center_set,
-             void* _workspace,
              float* centers_temp,
              uint32_t* cluster_sizes,
              bool shall_update_centers,
-             rmm::cuda_stream_view stream)
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr = nullptr)
 {
   if (n_rows == 0) {
     RAFT_LOG_WARN(
@@ -231,57 +229,54 @@ void predict(const handle_t& handle,
     return;
   }
 
-  uint32_t chunk_max = predict_chunk_size_(n_clusters, n_rows);
-  void* workspace    = _workspace;
-  rmm::device_buffer sub_workspace(0, stream);
-
-  if (_workspace == nullptr) {
-    sub_workspace.resize(predict_buffer_size(n_clusters, dim, n_rows), stream);
-    workspace = sub_workspace.data();
-  }
-
-  // [chunk_max, dim]
-  auto cur_dataset = reinterpret_cast<float*>(workspace);
-  // [chunk_max, dim]
-  auto buf_dataset    = reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(cur_dataset) +
-                                          Pow2<128>::roundUp(sizeof(float) * chunk_max * dim));
-  auto workspace_core = reinterpret_cast<float*>(
-    reinterpret_cast<uint8_t*>(buf_dataset) + Pow2<128>::roundUp(sizeof(float) * chunk_max * dim));
+  const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows);
 
   if (centers_temp != nullptr && cluster_sizes != nullptr) {
     utils::memset(centers_temp, 0, sizeof(float) * n_clusters * dim, stream);
     utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
   }
 
-  for (uint32_t offset = 0; offset < n_rows; offset += chunk_max) {
-    auto chunk = std::min<uint32_t>(chunk_max, n_rows - offset);
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
+  if (mr == nullptr) {
+    pool_res.emplace(rmm::mr::get_current_device_resource(),
+                     Pow2<256>::roundUp(max_minibatch_size * dim * 4));
+    mr = &(pool_res.value());
+  }
 
-    copy(buf_dataset, dataset + offset * dim, chunk * dim, stream);
-    handle.sync_stream(stream);
+  rmm::device_uvector<float> cur_dataset(max_minibatch_size * dim, stream, mr);
+  for (uint32_t offset = 0; offset < n_rows; offset += max_minibatch_size) {
+    auto minibatch_size = std::min<uint32_t>(max_minibatch_size, n_rows - offset);
 
     if constexpr (std::is_same_v<T, float>) {
-      // No need to copy floats
-      cur_dataset = buf_dataset;
+      copy(cur_dataset.data(), dataset + offset * dim, minibatch_size * dim, stream);
     } else {
-      linalg::unaryOp(cur_dataset, buf_dataset, chunk * dim, utils::mapping<float>{}, stream);
+      linalg::unaryOp(cur_dataset.data(),
+                      dataset + offset * dim,
+                      minibatch_size * dim,
+                      utils::mapping<float>{},
+                      stream);
     }
-
     // predict
     predict_core_(handle,
                   centers,
                   n_clusters,
                   dim,
-                  cur_dataset,
-                  chunk,
+                  cur_dataset.data(),
+                  minibatch_size,
                   labels + offset,
                   metric,
-                  workspace_core,
+                  mr,
                   stream);
 
     if ((centers_temp != nullptr) && (cluster_sizes != nullptr)) {
       // accumulate
-      utils::accumulate_into_selected<float>(
-        chunk, dim, centers_temp, cluster_sizes, cur_dataset, labels + offset, stream);
+      utils::accumulate_into_selected<float>(minibatch_size,
+                                             dim,
+                                             centers_temp,
+                                             cluster_sizes,
+                                             cur_dataset.data(),
+                                             labels + offset,
+                                             stream);
     }
   }
 

From 810c26b1462623dacaca60a19415f334cf98a8a1 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 8 Jun 2022 11:29:15 +0200
Subject: [PATCH 060/118] Address clang-tidy and other refactoring suggestions

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 607 ++++++++----------
 .../raft/spatial/knn/detail/ann_quantized.cuh |   4 +-
 2 files changed, 285 insertions(+), 326 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 1110326027..7ddd2fc398 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -72,45 +72,25 @@ __global__ void write_ivf_flat_interleaved_index(
   }
 }
 
-/* CUIVFL status type */
-enum cuivflStatus_t : unsigned int {
-  CUIVFL_STATUS_SUCCESS           = 0,
-  CUIVFL_STATUS_ALLOC_FAILED      = 1,
-  CUIVFL_STATUS_NOT_INITIALIZED   = 2,
-  CUIVFL_STATUS_INVALID_VALUE     = 3,
-  CUIVFL_STATUS_INTERNAL_ERROR    = 4,
-  CUIVFL_STATUS_FILEIO_ERROR      = 5,
-  CUIVFL_STATUS_CUDA_ERROR        = 6,
-  CUIVFL_STATUS_CUBLAS_ERROR      = 7,
-  CUIVFL_STATUS_INVALID_POINTER   = 8,
-  CUIVFL_STATUS_VERSION_ERROR     = 9,
-  CUIVFL_STATUS_UNSUPPORTED_DTYPE = 10,
-  CUIVFL_STATUS_FAISS_ERROR       = 11,
-  CUIVFL_STATUS_NOT_BUILD         = 12
-};
-
 template <typename T>
 class cuivflHandle {
  public:
   cuivflHandle(const handle_t& handle,
                raft::distance::DistanceType metric_type,
                uint32_t dim,
-               uint32_t nlist,
-               uint32_t niter,
-               uint32_t device);
+               uint32_t n_lists,
+               uint32_t n_iters);
 
-  cuivflStatus_t cuivflBuildIndex(const T* dataset, T* trainset, uint32_t nrow, uint32_t nTrainset);
+  void cuivflBuildIndex(const T* dataset, T* trainset, uint32_t n_rows, uint32_t nTrainset);
 
-  cuivflStatus_t cuivflSetSearchParameters(const uint32_t nprobe,
-                                           const uint32_t max_batch,
-                                           const uint32_t max_k);
+  void cuivflSetSearchParameters(const uint32_t n_probes,
+                                 const uint32_t max_batch,
+                                 const uint32_t max_k);
 
-  cuivflStatus_t cuivflSearch(
-    const T* queries, uint32_t batch_size, uint32_t k, size_t* neighbors, float* distances);
+  void cuivflSearch(
+    const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, float* distances);
 
-  cuivflStatus_t queryIVFFlatGridSize(const uint32_t nprobe,
-                                      const uint32_t batch_size,
-                                      const uint32_t k);
+  void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
   uint32_t getDim() { return dim_; }
 
  private:
@@ -119,27 +99,27 @@ class cuivflHandle {
 
   raft::distance::DistanceType metric_type_;
   bool greater_;
-  uint32_t nlist_;       // The number of inverted lists= the number of centriods
-  uint32_t niter_;       // The number of uint32_terations for kmeans to build the indexs
+  uint32_t n_lists_;     // The number of inverted lists= the number of centriods
+  uint32_t n_iters_;     // The number of uint32_terations for kmeans to build the indexs
   uint32_t dim_;         // The dimension of vectors for input dataset
-  uint32_t nprobe_;      // The number of clusters for searching
-  uint32_t nrow_;        // The number of elements for input dataset
+  uint32_t n_probes_;    // The number of clusters for searching
+  uint32_t n_rows_;      // The number of elements for input dataset
   size_t ninterleave_;   // The number of elements in 32 interleaved group for input dataset
   uint32_t veclen_;      // The vectorization length of dataset in index.
-  uint32_t grid_dim_x_;  // The number of blocks launched across nprobe.
+  uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
 
   // device pointer
   //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
   rmm::device_uvector<T> list_data_dev_;
   // The device memory pointer; inverted list for index; size [ninterleave_]
   rmm::device_uvector<uint32_t> list_index_dev_;
-  // The device memory pointer; Used for list_data_manage_ptr_; size [nlist_]
+  // The device memory pointer; Used for list_data_manage_ptr_; size [n_lists_]
   rmm::device_uvector<uint32_t> list_prefix_interleaved_dev_;
-  // The device memory pointer; the number of each cluster(list); size [nlist_]
+  // The device memory pointer; the number of each cluster(list); size [n_lists_]
   rmm::device_uvector<uint32_t> list_lengths_dev_;
-  // The device memory pointer; centriod; size [nlist_, dim_]
+  // The device memory pointer; centriod; size [n_lists_, dim_]
   rmm::device_uvector<float> centriod_dev_;
-  // The device memory pointer; centriod norm ; size [nlist_, dim_]
+  // The device memory pointer; centriod norm ; size [n_lists_, dim_]
   rmm::device_uvector<float> centriod_norm_dev_;
   // Memory pool for use during search; after the first search is done the pool is not likely to
   // resize, saving the costs of allocations.
@@ -150,35 +130,34 @@ class cuivflHandle {
   std::vector<T> list_data_host_;
   // The host memory pointer; inverted list for index; size [ninterleave_]
   std::vector<uint32_t> list_index_host_;
-  // The host memory pointer; Used for list_data_manage_ptr_; size [nlist_]
+  // The host memory pointer; Used for list_data_manage_ptr_; size [n_lists_]
   std::vector<uint32_t> list_prefix_interleaved_host_;
-  // The host memory pointer; the number of each cluster(list); size [nlist_]
+  // The host memory pointer; the number of each cluster(list); size [n_lists_]
   std::vector<uint32_t> list_lengths_host_;
 
-  cuivflStatus_t cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                            const T* dataset,
-                                            T* trainset,
-                                            uint32_t* clusterSize,
-                                            uint32_t nrow,
-                                            uint32_t ntrain);
+  void cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
+                                  const T* dataset,
+                                  T* trainset,
+                                  uint32_t* clusterSize,
+                                  uint32_t n_rows,
+                                  uint32_t n_rows_train);
 
-  template <typename value_t>
-  cuivflStatus_t cuivflSearchImpl(
-    const T* queries, uint32_t batch_size, uint32_t k, size_t* neighbors, value_t* distances);
+  template <typename AccT>
+  void cuivflSearchImpl(
+    const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, AccT* distances);
 };
 
 template <typename T>
 cuivflHandle<T>::cuivflHandle(const handle_t& handle,
                               raft::distance::DistanceType metric_type,
                               uint32_t dim,
-                              uint32_t nlist,
-                              uint32_t niter,
-                              uint32_t device)
+                              uint32_t n_lists,
+                              uint32_t n_iters)
   : handle_(handle),
     stream_(handle_.get_stream()),
     dim_(dim),
-    nlist_(nlist),
-    niter_(niter),
+    n_lists_(n_lists),
+    n_iters_(n_iters),
     metric_type_(metric_type),
     grid_dim_x_(0),
     list_data_dev_(0, stream_),
@@ -203,69 +182,64 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
  *
  */
 template <typename T>
-cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                                           const T* dataset,
-                                                           T* trainset,
-                                                           uint32_t* datasetLabels,
-                                                           uint32_t nrow,
-                                                           uint32_t ntrain)
+void cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
+                                                 const T* dataset,
+                                                 T* trainset,
+                                                 uint32_t* labels,
+                                                 uint32_t n_rows,
+                                                 uint32_t n_rows_train)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "cuivflBuildOptimizedKmeans(%u, %u)", nrow, ntrain);
-  uint32_t numTrainset   = ntrain;
-  uint32_t numClusters   = nlist_;
-  uint32_t dimDataset    = dim_;
-  uint32_t numIterations = niter_;
+    "cuivflBuildOptimizedKmeans(%u, %u)", n_rows, n_rows_train);
 
-  rmm::device_uvector<uint32_t> trainsetLabels(numTrainset, stream_);
+  rmm::device_uvector<uint32_t> trainset_labels(n_rows_train, stream_);
 
-  float* clusterCenters = centriod_managed_ptr;
+  float* cluster_centers = centriod_managed_ptr;
 
-  uint32_t numMesoClusters = pow((double)(numClusters), (double)1.0 / 2.0) + 0.5;
-  RAFT_LOG_DEBUG("(%s) # numMesoClusters: %u", __func__, numMesoClusters);
+  uint32_t n_mesoclusters = std::pow<double>(n_lists_, 0.5) + 0.5;
+  RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
 
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> mesoClusterCenters(
-    numMesoClusters * dimDataset, stream_, &managed_memory);
-  rmm::device_uvector<uint32_t> mesoClusterLabels(numTrainset, stream_, &managed_memory);
-  rmm::device_uvector<uint32_t> mesoClusterSize_buf(numMesoClusters, stream_, &managed_memory);
-  rmm::device_uvector<float> mesoClusterCentersTemp(
-    numMesoClusters * dimDataset, stream_, &managed_memory);
+  rmm::device_uvector<float> mesocluster_centers(n_mesoclusters * dim_, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mesocluster_labels(n_rows_train, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream_, &managed_memory);
+  rmm::device_uvector<float> mesocluster_centers_tmp(
+    n_mesoclusters * dim_, stream_, &managed_memory);
 
-  auto mesoClusterSize = mesoClusterSize_buf.data();
+  auto mesocluster_sizes = mesocluster_sizes_buf.data();
 
   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> kmeans_mem_res(
     rmm::mr::get_current_device_resource(),
     // an arbitrary guess on the upper bound of the workspace size
-    Pow2<256>::roundUp(kmeans::calc_minibatch_size(numMesoClusters, nrow) * dimDataset * 4));
+    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim_ * 4));
 
   // Training meso-clusters
-  for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
-    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, numIterations);
+  for (uint32_t iter = 0; iter < 2 * n_iters_; iter += 2) {
+    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, n_iters_);
     kmeans::predict(handle_,
-                    mesoClusterCenters.data(),
-                    numMesoClusters,
-                    dimDataset,
+                    mesocluster_centers.data(),
+                    n_mesoclusters,
+                    dim_,
                     trainset,
-                    numTrainset,
-                    mesoClusterLabels.data(),
+                    n_rows_train,
+                    mesocluster_labels.data(),
                     metric_type_,
                     (iter != 0),
-                    mesoClusterCentersTemp.data(),
-                    mesoClusterSize,
+                    mesocluster_centers_tmp.data(),
+                    mesocluster_sizes,
                     true,
                     stream_,
                     &kmeans_mem_res);
 
-    if (iter + 1 < 2 * numIterations) {
-      if (kmeans::adjust_centers(mesoClusterCenters.data(),
-                                 numMesoClusters,
-                                 dimDataset,
+    if (iter + 1 < 2 * n_iters_) {
+      if (kmeans::adjust_centers(mesocluster_centers.data(),
+                                 n_mesoclusters,
+                                 dim_,
                                  trainset,
-                                 numTrainset,
-                                 mesoClusterLabels.data(),
+                                 n_rows_train,
+                                 mesocluster_labels.data(),
                                  metric_type_,
-                                 mesoClusterSize,
+                                 mesocluster_sizes,
                                  (float)1.0 / 4,
                                  stream_)) {
         iter -= 1;
@@ -275,134 +249,128 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
 
   handle_.sync_stream(stream_);
 
-  std::vector<uint32_t> numFineClusters(numMesoClusters);
-  std::vector<uint32_t> csumFineClusters(numMesoClusters + 1);
-  csumFineClusters[0] = 0;
-
-  uint32_t numClustersRemain  = numClusters;
-  uint32_t numTrainsetRemain  = numTrainset;
-  uint32_t mesoClusterSizeMax = 0;
-  uint32_t mesoClusterSizeSum = 0;
-  uint32_t numFineClustersSum = 0;  // checking
-  uint32_t numFineClustersMax = 0;
-  for (uint32_t i = 0; i < numMesoClusters; i++) {
-    if (i < numMesoClusters - 1) {
-      numFineClusters[i] = (double)numClustersRemain * mesoClusterSize[i] / numTrainsetRemain + .5;
+  std::vector<uint32_t> fine_clusters_nums(n_mesoclusters);
+  std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
+  fine_clusters_csum[0] = 0;
+
+  uint32_t n_lists_rem            = n_lists_;
+  uint32_t n_rows_train_rem       = n_rows_train;
+  uint32_t mesocluster_size_max   = 0;
+  uint32_t mesocluster_size_sum   = 0;
+  uint32_t fine_clusters_nums_sum = 0;  // checking
+  uint32_t fine_clusters_nums_max = 0;
+  for (uint32_t i = 0; i < n_mesoclusters; i++) {
+    if (i < n_mesoclusters - 1) {
+      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_train_rem + .5;
     } else {
-      numFineClusters[i] = numClustersRemain;
+      fine_clusters_nums[i] = n_lists_rem;
     }
-    numClustersRemain -= numFineClusters[i];
-    numTrainsetRemain -= mesoClusterSize[i];
-    mesoClusterSizeMax = max(mesoClusterSizeMax, mesoClusterSize[i]);
-    mesoClusterSizeSum += mesoClusterSize[i];
-    numFineClustersSum += numFineClusters[i];
-    numFineClustersMax      = max(numFineClustersMax, numFineClusters[i]);
-    csumFineClusters[i + 1] = csumFineClusters[i] + numFineClusters[i];
-  }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
-
-  RAFT_LOG_DEBUG("(%s) # mesoClusterSizeSum: %u", __func__, mesoClusterSizeSum);
-  RAFT_LOG_DEBUG("(%s) # numFineClustersSum: %u", __func__, numFineClustersSum);
-  assert(mesoClusterSizeSum == numTrainset);
-  assert(numFineClustersSum == numClusters);
-  assert(csumFineClusters[numMesoClusters] == numClusters);
-
-  rmm::device_uvector<uint32_t> idsTrainset_buf(mesoClusterSizeMax, stream_, &managed_memory);
-  rmm::device_uvector<float> subTrainset_buf(
-    mesoClusterSizeMax * dimDataset, stream_, &managed_memory);
-  auto idsTrainset = idsTrainset_buf.data();
-  auto subTrainset = subTrainset_buf.data();
+    n_lists_rem -= fine_clusters_nums[i];
+    n_rows_train_rem -= mesocluster_sizes[i];
+    mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]);
+    mesocluster_size_sum += mesocluster_sizes[i];
+    fine_clusters_nums_sum += fine_clusters_nums[i];
+    fine_clusters_nums_max    = max(fine_clusters_nums_max, fine_clusters_nums[i]);
+    fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
+  }
+
+  RAFT_LOG_DEBUG("(%s) # mesocluster_size_sum: %u", __func__, mesocluster_size_sum);
+  RAFT_LOG_DEBUG("(%s) # fine_clusters_nums_sum: %u", __func__, fine_clusters_nums_sum);
+  assert(mesocluster_size_sum == n_rows_train);
+  assert(fine_clusters_nums_sum == n_lists_);
+  assert(fine_clusters_csum[n_mesoclusters] == n_lists_);
+
+  rmm::device_uvector<uint32_t> mc_trainset_ids_buf(mesocluster_size_max, stream_, &managed_memory);
+  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim_, stream_, &managed_memory);
+  auto mc_trainset_ids = mc_trainset_ids_buf.data();
+  auto mc_trainset     = mc_trainset_buf.data();
 
   // label (cluster ID) of each vector
-  rmm::device_uvector<uint32_t> labelsMP(mesoClusterSizeMax, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mc_trainset_labels(mesocluster_size_max, stream_, &managed_memory);
 
-  rmm::device_uvector<float> clusterCentersEach(
-    numFineClustersMax * dimDataset, stream_, &managed_memory);
-  rmm::device_uvector<float> clusterCentersMP(
-    numFineClustersMax * dimDataset, stream_, &managed_memory);
+  rmm::device_uvector<float> mc_trainset_ccenters(
+    fine_clusters_nums_max * dim_, stream_, &managed_memory);
+  rmm::device_uvector<float> mc_trainset_ccenters_tmp(
+    fine_clusters_nums_max * dim_, stream_, &managed_memory);
   // number of vectors in each cluster
-  rmm::device_uvector<uint32_t> clusterSizeMP(numFineClustersMax, stream_, &managed_memory);
+  rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
+    fine_clusters_nums_max, stream_, &managed_memory);
 
   // Training clusters in each meso-clusters
-  uint32_t numClustersDone = 0;
-  for (uint32_t i = 0; i < numMesoClusters; i++) {
+  uint32_t n_clusters_done = 0;
+  for (uint32_t i = 0; i < n_mesoclusters; i++) {
     uint32_t k = 0;
-    for (uint32_t j = 0; j < numTrainset; j++) {
-      if (mesoClusterLabels.data()[j] != i) continue;
-      idsTrainset[k++] = j;
+    for (uint32_t j = 0; j < n_rows_train; j++) {
+      if (mesocluster_labels.data()[j] != i) continue;
+      mc_trainset_ids[k++] = j;
     }
-    assert(k == mesoClusterSize[i]);
-
-    utils::copy_selected<T>(mesoClusterSize[i],
-                            dimDataset,
-                            trainset,
-                            idsTrainset,
-                            dimDataset,
-                            subTrainset,
-                            dimDataset,
-                            stream_);
-
-    for (uint32_t iter = 0; iter < 2 * numIterations; iter += 2) {
-      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (numClusters: %u): %.1f / %u",
+    assert(k == mesocluster_sizes[i]);
+
+    utils::copy_selected<T>(
+      mesocluster_sizes[i], dim_, trainset, mc_trainset_ids, dim_, mc_trainset, dim_, stream_);
+
+    for (uint32_t iter = 0; iter < 2 * n_iters_; iter += 2) {
+      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (n_lists: %u): %.1f / %u",
                      i,
-                     numFineClusters[i],
+                     fine_clusters_nums[i],
                      (float)iter / 2,
-                     numIterations);
+                     n_iters_);
 
       kmeans::predict(handle_,
-                      clusterCentersEach.data(),
-                      numFineClusters[i],
-                      dimDataset,
-                      subTrainset,
-                      mesoClusterSize[i],
-                      labelsMP.data(),
+                      mc_trainset_ccenters.data(),
+                      fine_clusters_nums[i],
+                      dim_,
+                      mc_trainset,
+                      mesocluster_sizes[i],
+                      mc_trainset_labels.data(),
                       metric_type_,
                       (iter != 0),
-                      clusterCentersMP.data(),
-                      clusterSizeMP.data(),
+                      mc_trainset_ccenters_tmp.data(),
+                      mc_trainset_csizes_tmp.data(),
                       true,
                       stream_,
                       &kmeans_mem_res);
 
-      if (iter + 1 < 2 * numIterations) {
-        if (kmeans::adjust_centers(clusterCentersEach.data(),
-                                   numFineClusters[i],
-                                   dimDataset,
-                                   subTrainset,
-                                   mesoClusterSize[i],
-                                   labelsMP.data(),
+      if (iter + 1 < 2 * n_iters_) {
+        if (kmeans::adjust_centers(mc_trainset_ccenters.data(),
+                                   fine_clusters_nums[i],
+                                   dim_,
+                                   mc_trainset,
+                                   mesocluster_sizes[i],
+                                   mc_trainset_labels.data(),
                                    metric_type_,
-                                   clusterSizeMP.data(),
+                                   mc_trainset_csizes_tmp.data(),
                                    (float)1.0 / 4,
                                    stream_)) {
           iter -= 1;
         }
       }
     }
-    copy(clusterCenters + (dimDataset * csumFineClusters[i]),
-         clusterCentersEach.data(),
-         numFineClusters[i] * dimDataset,
+    copy(cluster_centers + (dim_ * fine_clusters_csum[i]),
+         mc_trainset_ccenters.data(),
+         fine_clusters_nums[i] * dim_,
          stream_);
     handle_.sync_stream(stream_);
-    numClustersDone += numFineClusters[i];
-  }  // end for (uint32_t i = 0; i < numMesoClusters; i++)
-  assert(numClustersDone == numClusters);
+    n_clusters_done += fine_clusters_nums[i];
+  }  // end for (uint32_t i = 0; i < n_mesoclusters; i++)
+  assert(n_clusters_done == n_lists_);
 
-  clusterCentersMP.resize(numClusters * dimDataset, stream_);
-  clusterSizeMP.resize(numClusters, stream_);
+  mc_trainset_ccenters_tmp.resize(n_lists_ * dim_, stream_);
+  mc_trainset_csizes_tmp.resize(n_lists_, stream_);
 
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
     kmeans::predict(handle_,
-                    clusterCenters,
-                    numClusters,
-                    dimDataset,
+                    cluster_centers,
+                    n_lists_,
+                    dim_,
                     trainset,
-                    numTrainset,
-                    trainsetLabels.data(),
+                    n_rows_train,
+                    trainset_labels.data(),
                     metric_type_,
                     true,
-                    clusterCentersMP.data(),
-                    clusterSizeMP.data(),
+                    mc_trainset_ccenters_tmp.data(),
+                    mc_trainset_csizes_tmp.data(),
                     true,
                     stream_,
                     &kmeans_mem_res);
@@ -411,93 +379,90 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_manag
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
   kmeans::predict(handle_,
-                  clusterCenters,
-                  nlist_,
+                  cluster_centers,
+                  n_lists_,
                   dim_,
                   dataset,
-                  nrow_,
-                  datasetLabels,
+                  n_rows_,
+                  labels,
                   metric_type_,
                   true,
-                  clusterCentersMP.data(),
-                  clusterSizeMP.data(),
+                  mc_trainset_ccenters_tmp.data(),
+                  mc_trainset_csizes_tmp.data(),
                   true,
                   stream_,
                   &kmeans_mem_res);
 
   kmeans::predict(handle_,
-                  clusterCenters,
-                  nlist_,
+                  cluster_centers,
+                  n_lists_,
                   dim_,
                   dataset,
-                  nrow_,
-                  datasetLabels,
+                  n_rows_,
+                  labels,
                   metric_type_,
                   true,
-                  clusterCentersMP.data(),
-                  clusterSizeMP.data(),
+                  mc_trainset_ccenters_tmp.data(),
+                  mc_trainset_csizes_tmp.data(),
                   false,
                   stream_,
                   &kmeans_mem_res);
-
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
-}  // end func cuivflBuildOptimizedKmeans
+}
 
 template <typename T>
-cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
-                                                 T* trainset,
-                                                 uint32_t nrow,
-                                                 uint32_t ntrain)
+void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
+                                       T* trainset,
+                                       uint32_t n_rows,
+                                       uint32_t n_rows_train)
 {
-  nrow_ = nrow;
+  n_rows_ = n_rows;
+  RAFT_EXPECTS(n_rows_ > 0, "empty dataset");
 
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> centriod_managed_buf(nlist_ * dim_, stream_, &managed_memory);
+  rmm::device_uvector<float> centriod_managed_buf(n_lists_ * dim_, stream_, &managed_memory);
   auto centriod_managed_ptr = centriod_managed_buf.data();
 
-  if (this == nullptr || nrow_ == 0) { return CUIVFL_STATUS_NOT_INITIALIZED; }
-  if constexpr (!std::is_same_v<T, float> && !std::is_same_v<T, uint8_t> &&
-                !std::is_same_v<T, int8_t>) {
-    return CUIVFL_STATUS_UNSUPPORTED_DTYPE;
-  }
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+                "unsupported data type");
 
   // Alloc manage memory for centriods, trainset and workspace
-  rmm::device_uvector<uint32_t> datasetLabels_buf(nrow_, stream_, &managed_memory);  // [numDataset]
-  auto datasetLabels = datasetLabels_buf.data();
+  rmm::device_uvector<uint32_t> labels_buf(n_rows_, stream_, &managed_memory);  // [numDataset]
+  auto labels = labels_buf.data();
 
   // Predict labels of the whole dataset
-  cuivflBuildOptimizedKmeans(centriod_managed_ptr, dataset, trainset, datasetLabels, nrow, ntrain);
+  cuivflBuildOptimizedKmeans(centriod_managed_ptr, dataset, trainset, labels, n_rows, n_rows_train);
 
   // Calculate the L2 related result
-  centriod_norm_dev_.resize(nlist_, stream_);
+  centriod_norm_dev_.resize(n_lists_, stream_);
 
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::dots_along_rows(nlist_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
+    utils::dots_along_rows(
+      n_lists_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
 
   // Record the number of elements in each clusters
   handle_.sync_stream(stream_);
 
-  list_prefix_interleaved_host_.resize(nlist_);
-  list_lengths_host_.assign(nlist_, 0);
-  for (uint32_t i = 0; i < nrow_; i++) {
-    uint32_t id_cluster = datasetLabels[i];
+  list_prefix_interleaved_host_.resize(n_lists_);
+  list_lengths_host_.assign(n_lists_, 0);
+  for (uint32_t i = 0; i < n_rows_; i++) {
+    uint32_t id_cluster = labels[i];
     list_lengths_host_[id_cluster] += 1;
   }
 
   ninterleave_ = 0;
-  for (uint32_t i = 0; i < nlist_; i++) {
+  for (uint32_t i = 0; i < n_lists_; i++) {
     list_prefix_interleaved_host_[i] = ninterleave_;
     ninterleave_ += Pow2<WarpSize>::roundUp(list_lengths_host_[i]);
   }
 
   list_data_host_.assign(ninterleave_ * dim_, 0);
   list_index_host_.assign(ninterleave_, 0);
-  list_lengths_host_.assign(nlist_, 0);
+  list_lengths_host_.assign(n_lists_, 0);
 
-  for (size_t i = 0; i < nrow_; i++) {
-    uint32_t id_cluster     = datasetLabels[i];
+  for (size_t i = 0; i < n_rows_; i++) {
+    uint32_t id_cluster     = labels[i];
     uint32_t current_add    = list_lengths_host_[id_cluster];
     uint32_t interleave_add = list_prefix_interleaved_host_[id_cluster];
     _ivfflat_interleaved(
@@ -509,25 +474,24 @@ cuivflStatus_t cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   // Store index on GPU memory: temp WAR until we've entire index building buffers on device
   list_data_dev_.resize(ninterleave_ * dim_, stream_);
   list_index_dev_.resize(ninterleave_, stream_);
-  list_prefix_interleaved_dev_.resize(nlist_, stream_);
-  list_lengths_dev_.resize(nlist_, stream_);
-  centriod_dev_.resize(nlist_ * dim_, stream_);
+  list_prefix_interleaved_dev_.resize(n_lists_, stream_);
+  list_lengths_dev_.resize(n_lists_, stream_);
+  centriod_dev_.resize(n_lists_ * dim_, stream_);
 
   // Read the list
-  copy(list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), nlist_, stream_);
-  copy(list_lengths_dev_.data(), list_lengths_host_.data(), nlist_, stream_);
-  copy(centriod_dev_.data(), centriod_managed_ptr, nlist_ * dim_, stream_);
+  copy(
+    list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), n_lists_, stream_);
+  copy(list_lengths_dev_.data(), list_lengths_host_.data(), n_lists_, stream_);
+  copy(centriod_dev_.data(), centriod_managed_ptr, n_lists_ * dim_, stream_);
 
   copy(list_data_dev_.data(), list_data_host_.data(), ninterleave_ * dim_, stream_);
   copy(list_index_dev_.data(), list_index_host_.data(), ninterleave_, stream_);
-
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
 template <typename T>
-cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
-                                                     const uint32_t batch_size,
-                                                     const uint32_t k)
+void cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t n_probes,
+                                           const uint32_t n_queries,
+                                           const uint32_t k)
 {
   // query the gridDimX size to store probes topK output
   ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(nullptr,
@@ -537,9 +501,9 @@ cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
                                                                   nullptr,
                                                                   nullptr,
                                                                   metric_type_,
-                                                                  nprobe,
+                                                                  n_probes,
                                                                   k,
-                                                                  batch_size,
+                                                                  n_queries,
                                                                   dim_,
                                                                   nullptr,
                                                                   nullptr,
@@ -547,16 +511,16 @@ cuivflStatus_t cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t nprobe,
                                                                   greater_,
                                                                   veclen_,
                                                                   grid_dim_x_);
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
 template <typename T>
-cuivflStatus_t cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t nprobe,
-                                                          const uint32_t max_batch,
-                                                          const uint32_t max_k)
+void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
+                                                const uint32_t max_batch,
+                                                const uint32_t max_k)
 {
-  nprobe_ = nprobe;
-  if (nprobe_ <= 0) { return CUIVFL_STATUS_INVALID_VALUE; }
+  RAFT_EXPECTS(n_probes > 0,
+               "n_probes (number of clusters to probe in the search) must be positive.");
+  n_probes_ = n_probes;
   // Set the greater_
   if (metric_type_ == raft::distance::DistanceType::L2Expanded ||
       metric_type_ == raft::distance::DistanceType::L2Unexpanded) {
@@ -570,53 +534,50 @@ cuivflStatus_t cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t nprobe,
   auto cur_memory_resource = rmm::mr::get_current_device_resource();
   if (!search_mem_res.has_value() || search_mem_res->get_upstream() != cur_memory_resource) {
     search_mem_res.emplace(cur_memory_resource,
-                           Pow2<256>::roundUp(max_batch * nprobe * max_k * 16));
+                           Pow2<256>::roundUp(max_batch * n_probes * max_k * 16));
   }
-
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
 template <typename T>
-cuivflStatus_t cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries, dimDataset]
-                                             uint32_t batch_size,
-                                             uint32_t k,
-                                             size_t* neighbors,  // [numQueries, topK]
-                                             float* distances)
+void cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries, dim]
+                                   uint32_t n_queries,
+                                   uint32_t k,
+                                   size_t* neighbors,  // [numQueries, topK]
+                                   float* distances)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "cuivflSearch(%u, %u, %zu)", batch_size, k, neighbors);
-  cuivflSearchImpl<float>(queries, batch_size, k, neighbors, distances);
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
+    "cuivflSearch(%u, %u, %zu)", n_queries, k, neighbors);
+  cuivflSearchImpl<float>(queries, n_queries, k, neighbors, distances);
 }
 
 template <typename T>
-template <typename value_t>
-cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dimDataset]
-                                                 uint32_t batch_size,
-                                                 uint32_t k,
-                                                 size_t* neighbors,  // [numQueries, topK]
-                                                 value_t* distances)
+template <typename AccT>
+void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
+                                       uint32_t n_queries,
+                                       uint32_t k,
+                                       size_t* neighbors,  // [numQueries, topK]
+                                       AccT* distances)
 {
-  uint32_t nprobe = std::min(nprobe_, nlist_);
-  grid_dim_x_     = 0;
-  queryIVFFlatGridSize(nprobe, batch_size, k);
+  uint32_t n_probes = std::min(n_probes_, n_lists_);
+  grid_dim_x_       = 0;
+  queryIVFFlatGridSize(n_probes, n_queries, k);
   auto search_mr = &(search_mem_res.value());
   // The norm of query
-  rmm::device_uvector<float> query_norm_dev(batch_size, stream_, search_mr);
+  rmm::device_uvector<float> query_norm_dev(n_queries, stream_, search_mr);
   // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(batch_size * nlist_, stream_, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * n_lists_, stream_, search_mr);
   // The topk distance value of cluster(list) and queries
-  rmm::device_uvector<float> coarse_distances_dev(batch_size * nprobe, stream_, search_mr);
+  rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream_, search_mr);
   // The topk  index of cluster(list) and queries
-  rmm::device_uvector<uint32_t> coarse_indices_dev(batch_size * nprobe, stream_, search_mr);
+  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries * n_probes, stream_, search_mr);
   // The topk distance value of candicate vectors from each cluster(list)
-  rmm::device_uvector<value_t> refined_distances_dev(batch_size * nprobe * k, stream_, search_mr);
+  rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream_, search_mr);
   // The topk index of candicate vectors from each cluster(list)
-  rmm::device_uvector<size_t> refined_indices_dev(batch_size * nprobe * k, stream_, search_mr);
+  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream_, search_mr);
 
   size_t float_query_size;
   if constexpr (std::is_integral_v<T>) {
-    float_query_size = batch_size * dim_;
+    float_query_size = n_queries * dim_;
   } else {
     float_query_size = 0;
   }
@@ -627,7 +588,7 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
     linalg::unaryOp(
-      converted_queries_ptr, queries, batch_size * dim_, utils::mapping<float>{}, stream_);
+      converted_queries_ptr, queries, n_queries * dim_, utils::mapping<float>{}, stream_);
   }
 
   float alpha = 1.0f;
@@ -636,11 +597,11 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
-    utils::dots_along_rows(batch_size, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
+    utils::dots_along_rows(n_queries, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
     utils::outer_add(query_norm_dev.data(),
-                     batch_size,
+                     n_queries,
                      centriod_norm_dev_.data(),
-                     nlist_,
+                     n_lists_,
                      distance_buffer_dev.data(),
                      stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
@@ -653,8 +614,8 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   linalg::gemm(handle_,
                true,
                false,
-               nlist_,
-               batch_size,
+               n_lists_,
+               n_queries,
                dim_,
                &alpha,
                centriod_dev_.data(),
@@ -663,38 +624,38 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
                dim_,
                &beta,
                distance_buffer_dev.data(),
-               nlist_,
+               n_lists_,
                stream_);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
-  if (nprobe <= raft::spatial::knn::detail::topk::kMaxCapacity) {
-    topk::warp_sort_topk<value_t, uint32_t>(distance_buffer_dev.data(),
-                                            nullptr,
-                                            batch_size,
-                                            nlist_,
-                                            nprobe,
-                                            coarse_distances_dev.data(),
-                                            coarse_indices_dev.data(),
-                                            !greater_,
-                                            stream_);
+  if (n_probes <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+    topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
+                                         nullptr,
+                                         n_queries,
+                                         n_lists_,
+                                         n_probes,
+                                         coarse_distances_dev.data(),
+                                         coarse_indices_dev.data(),
+                                         !greater_,
+                                         stream_);
   } else {
-    topk::radix_topk<value_t, uint32_t, 11, 512>(distance_buffer_dev.data(),
-                                                 nullptr,
-                                                 batch_size,
-                                                 nlist_,
-                                                 nprobe,
-                                                 coarse_distances_dev.data(),
-                                                 coarse_indices_dev.data(),
-                                                 !greater_,
-                                                 stream_,
-                                                 &(search_mem_res.value()));
+    topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
+                                              nullptr,
+                                              n_queries,
+                                              n_lists_,
+                                              n_probes,
+                                              coarse_distances_dev.data(),
+                                              coarse_indices_dev.data(),
+                                              !greater_,
+                                              stream_,
+                                              &(search_mem_res.value()));
   }
-  RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * nprobe);
-  RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * nprobe);
+  RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
+  RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes);
 
-  value_t* distances_dev_ptr = refined_distances_dev.data();
-  size_t* indices_dev_ptr    = refined_indices_dev.data();
-  if (nprobe == 1 || grid_dim_x_ == 1) {
+  AccT* distances_dev_ptr = refined_distances_dev.data();
+  size_t* indices_dev_ptr = refined_indices_dev.data();
+  if (n_probes == 1 || grid_dim_x_ == 1) {
     distances_dev_ptr = distances;
     indices_dev_ptr   = neighbors;
   }
@@ -707,9 +668,9 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
     list_lengths_dev_.data(),
     list_prefix_interleaved_dev_.data(),
     metric_type_,
-    nprobe,
+    n_probes,
     k,
-    batch_size,
+    n_queries,
     dim_,
     indices_dev_ptr,
     distances_dev_ptr,
@@ -724,30 +685,28 @@ cuivflStatus_t cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueri
   // Merge topk values from different blocks
   if (grid_dim_x_ > 1) {
     if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
-      topk::warp_sort_topk<value_t, size_t>(refined_distances_dev.data(),
-                                            refined_indices_dev.data(),
-                                            batch_size,
-                                            k * grid_dim_x_,
-                                            k,
-                                            distances,
-                                            neighbors,
-                                            !greater_,
-                                            stream_);
+      topk::warp_sort_topk<AccT, size_t>(refined_distances_dev.data(),
+                                         refined_indices_dev.data(),
+                                         n_queries,
+                                         k * grid_dim_x_,
+                                         k,
+                                         distances,
+                                         neighbors,
+                                         !greater_,
+                                         stream_);
     } else {
-      topk::radix_topk<value_t, size_t, 11, 512>(refined_distances_dev.data(),
-                                                 refined_indices_dev.data(),
-                                                 batch_size,
-                                                 k * grid_dim_x_,
-                                                 k,
-                                                 distances,
-                                                 neighbors,
-                                                 !greater_,
-                                                 stream_,
-                                                 &(search_mem_res.value()));
+      topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
+                                              refined_indices_dev.data(),
+                                              n_queries,
+                                              k * grid_dim_x_,
+                                              k,
+                                              distances,
+                                              neighbors,
+                                              !greater_,
+                                              stream_,
+                                              &(search_mem_res.value()));
     }
   }
-
-  return cuivflStatus_t::CUIVFL_STATUS_SUCCESS;
 }
 
 }  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 2b347968e7..3399e31044 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -118,8 +118,8 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                   cudaMemcpyDefault,
                                   stream));
 
-  index->handle_.get<T>() = std::make_unique<detail::cuivflHandle<T>>(
-    handle, metric, D, params->nlist, niter, index->device);
+  index->handle_.get<T>() =
+    std::make_unique<detail::cuivflHandle<T>>(handle, metric, D, params->nlist, niter);
 
   // NB: `trainset` is accessed by both CPU and GPU code here.
   index->handle_.get<T>()->cuivflBuildIndex(dataset, trainset.data(), n, ntrain);

From 042c4109180ec47b3749c04f773bdcbeddd70f2e Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 8 Jun 2022 17:14:55 +0200
Subject: [PATCH 061/118] Move part of the index building onto gpu

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 223 +++++++++---------
 .../raft/spatial/knn/detail/ann_quantized.cuh |  11 +-
 2 files changed, 112 insertions(+), 122 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 7ddd2fc398..bbb5d8259b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -31,6 +31,7 @@
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/spatial/knn/ann_common.h>
+#include <raft/stats/histogram.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -40,38 +41,6 @@
 
 namespace raft::spatial::knn::detail {
 
-template <typename T>
-void _ivfflat_interleaved(
-  T* list_data, const T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
-{
-  size_t group_id = index / WarpSize;
-  size_t in_id    = (index % WarpSize) * veclen;
-  list_data += (prefix + group_id * WarpSize) * dim + in_id;
-
-  for (size_t i = 0; i < dim; i += veclen) {
-    for (size_t j = 0; j < veclen; j++) {
-      list_data[i * WarpSize + j] = dataset[i + j];
-    }
-  }
-}
-
-// This kernel intends to remove the dependency of having dataset in managed mem/host mem.
-//
-template <typename T>
-__global__ void write_ivf_flat_interleaved_index(
-  T* list_data, const T* dataset, uint32_t dim, size_t index, size_t prefix, uint32_t veclen)
-{
-  size_t group_id = index / WarpSize;
-  size_t in_id    = (index % WarpSize) * veclen;
-  list_data += (prefix + group_id * WarpSize) * dim + in_id;
-
-  for (size_t i = 0; i < dim; i += veclen) {
-    for (size_t j = 0; j < veclen; j++) {
-      list_data[i * WarpSize + j] = dataset[i + j];
-    }
-  }
-}
-
 template <typename T>
 class cuivflHandle {
  public:
@@ -104,17 +73,17 @@ class cuivflHandle {
   uint32_t dim_;         // The dimension of vectors for input dataset
   uint32_t n_probes_;    // The number of clusters for searching
   uint32_t n_rows_;      // The number of elements for input dataset
-  size_t ninterleave_;   // The number of elements in 32 interleaved group for input dataset
+  uint32_t index_size_;  // The number of elements in 32 interleaved group for input dataset
   uint32_t veclen_;      // The vectorization length of dataset in index.
   uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
 
   // device pointer
-  //  The device memory pointer; inverted list for data; size [ninterleave_, dim_]
+  //  The device memory pointer; inverted list for data; size [index_size_, dim_]
   rmm::device_uvector<T> list_data_dev_;
-  // The device memory pointer; inverted list for index; size [ninterleave_]
+  // The device memory pointer; inverted list for index; size [index_size_]
   rmm::device_uvector<uint32_t> list_index_dev_;
   // The device memory pointer; Used for list_data_manage_ptr_; size [n_lists_]
-  rmm::device_uvector<uint32_t> list_prefix_interleaved_dev_;
+  rmm::device_uvector<uint32_t> list_offsets_dev_;
   // The device memory pointer; the number of each cluster(list); size [n_lists_]
   rmm::device_uvector<uint32_t> list_lengths_dev_;
   // The device memory pointer; centriod; size [n_lists_, dim_]
@@ -125,16 +94,6 @@ class cuivflHandle {
   // resize, saving the costs of allocations.
   std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res;
 
-  // host pointer
-  //  The host memory pointer; inverted list for data; size [ninterleave_, dim_]
-  std::vector<T> list_data_host_;
-  // The host memory pointer; inverted list for index; size [ninterleave_]
-  std::vector<uint32_t> list_index_host_;
-  // The host memory pointer; Used for list_data_manage_ptr_; size [n_lists_]
-  std::vector<uint32_t> list_prefix_interleaved_host_;
-  // The host memory pointer; the number of each cluster(list); size [n_lists_]
-  std::vector<uint32_t> list_lengths_host_;
-
   void cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                                   const T* dataset,
                                   T* trainset,
@@ -162,14 +121,10 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
     grid_dim_x_(0),
     list_data_dev_(0, stream_),
     list_index_dev_(0, stream_),
-    list_prefix_interleaved_dev_(0, stream_),
+    list_offsets_dev_(0, stream_),
     list_lengths_dev_(0, stream_),
     centriod_dev_(0, stream_),
-    centriod_norm_dev_(0, stream_),
-    list_index_host_(0),
-    list_prefix_interleaved_host_(0),
-    list_lengths_host_(0),
-    list_data_host_(0)
+    centriod_norm_dev_(0, stream_)
 {
   veclen_ = 16 / sizeof(T);
   while (dim % veclen_ != 0) {
@@ -409,6 +364,46 @@ void cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                   &kmeans_mem_res);
 }
 
+template <typename T>
+__global__ void build_index_kernel(const uint32_t* labels,
+                                   const uint32_t* list_offsets,
+                                   const T* dataset,
+                                   T* list_data,
+                                   uint32_t* list_index,
+                                   uint32_t* list_lengths,
+                                   uint32_t n_rows,
+                                   uint32_t dim,
+                                   uint32_t veclen)
+{
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i >= n_rows) { return; }
+
+  auto list_id     = labels[i];
+  auto inlist_id   = atomicAdd(list_lengths + list_id, 1);
+  auto list_offset = list_offsets[list_id];
+
+  // Record the source vector id in the index
+  list_index[list_offset + inlist_id] = i;
+
+  // The data is written in interleaved groups of `WarpSize` vectors
+  using interleaved_group = Pow2<WarpSize>;
+  auto group_offset       = interleaved_group::roundDown(inlist_id);
+  auto ingroup_id         = interleaved_group::mod(inlist_id) * veclen;
+
+  // Point to the location of the interleaved group of vectors
+  list_data += (list_offset + group_offset) * dim;
+
+  // Point to the source vector
+  dataset += i * dim;
+
+  // Interleave dimensions of the source vector while recording it.
+  for (uint32_t l = 0; l < dim; l += veclen) {
+    for (uint32_t j = 0; j < veclen; j++) {
+      list_data[l * WarpSize + ingroup_id + j] = dataset[l + j];
+    }
+  }
+}
+
 template <typename T>
 void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
                                        T* trainset,
@@ -440,52 +435,55 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
       n_lists_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
     RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
   }
-
-  // Record the number of elements in each clusters
-  handle_.sync_stream(stream_);
-
-  list_prefix_interleaved_host_.resize(n_lists_);
-  list_lengths_host_.assign(n_lists_, 0);
-  for (uint32_t i = 0; i < n_rows_; i++) {
-    uint32_t id_cluster = labels[i];
-    list_lengths_host_[id_cluster] += 1;
-  }
-
-  ninterleave_ = 0;
-  for (uint32_t i = 0; i < n_lists_; i++) {
-    list_prefix_interleaved_host_[i] = ninterleave_;
-    ninterleave_ += Pow2<WarpSize>::roundUp(list_lengths_host_[i]);
-  }
-
-  list_data_host_.assign(ninterleave_ * dim_, 0);
-  list_index_host_.assign(ninterleave_, 0);
-  list_lengths_host_.assign(n_lists_, 0);
-
-  for (size_t i = 0; i < n_rows_; i++) {
-    uint32_t id_cluster     = labels[i];
-    uint32_t current_add    = list_lengths_host_[id_cluster];
-    uint32_t interleave_add = list_prefix_interleaved_host_[id_cluster];
-    _ivfflat_interleaved(
-      list_data_host_.data(), dataset + i * dim_, dim_, current_add, interleave_add, veclen_);
-    list_index_host_[interleave_add + current_add] = i;
-    list_lengths_host_[id_cluster] += 1;
-  }
-
-  // Store index on GPU memory: temp WAR until we've entire index building buffers on device
-  list_data_dev_.resize(ninterleave_ * dim_, stream_);
-  list_index_dev_.resize(ninterleave_, stream_);
-  list_prefix_interleaved_dev_.resize(n_lists_, stream_);
-  list_lengths_dev_.resize(n_lists_, stream_);
   centriod_dev_.resize(n_lists_ * dim_, stream_);
-
-  // Read the list
-  copy(
-    list_prefix_interleaved_dev_.data(), list_prefix_interleaved_host_.data(), n_lists_, stream_);
-  copy(list_lengths_dev_.data(), list_lengths_host_.data(), n_lists_, stream_);
   copy(centriod_dev_.data(), centriod_managed_ptr, n_lists_ * dim_, stream_);
 
-  copy(list_data_dev_.data(), list_data_host_.data(), ninterleave_ * dim_, stream_);
-  copy(list_index_dev_.data(), list_index_host_.data(), ninterleave_, stream_);
+  list_lengths_dev_.resize(n_lists_, stream_);
+  auto list_lengths = list_lengths_dev_.data();
+  stats::histogram(stats::HistType::HistTypeAuto,
+                   reinterpret_cast<int*>(list_lengths),
+                   n_lists_,
+                   labels,
+                   n_rows_,
+                   uint32_t(1),
+                   stream_);
+
+  // NB: stream_ must be equal to handle_.get_stream() to have the thrust functions executed in
+  // order with the rest
+  auto thrust_policy = handle_.get_thrust_policy();
+
+  list_offsets_dev_.resize(n_lists_ + 1, stream_);
+  auto list_offsets = list_offsets_dev_.data();
+
+  thrust::exclusive_scan(
+    thrust_policy,
+    list_lengths,
+    list_lengths + n_lists_ + 1,
+    list_offsets,
+    uint32_t(0),
+    [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
+
+  update_host(&index_size_, list_offsets + n_lists_, 1, stream_);
+  handle_.sync_stream(stream_);
+
+  list_data_dev_.resize(index_size_ * dim_, stream_);
+  list_index_dev_.resize(index_size_, stream_);
+
+  // we'll rebuild the list_lengths in the following kernels, using it as an atomic counter.
+  utils::memset(list_lengths, 0, sizeof(uint32_t) * n_lists_, stream_);
+
+  const dim3 block_dim(256);
+  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows_, block_dim.x));
+  build_index_kernel<<<grid_dim, block_dim, 0, stream_>>>(labels,
+                                                          list_offsets,
+                                                          dataset,
+                                                          list_data_dev_.data(),
+                                                          list_index_dev_.data(),
+                                                          list_lengths,
+                                                          n_rows_,
+                                                          dim_,
+                                                          veclen_);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 template <typename T>
@@ -660,24 +658,23 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     indices_dev_ptr   = neighbors;
   }
 
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(
-    queries,
-    coarse_indices_dev.data(),
-    list_index_dev_.data(),
-    list_data_dev_.data(),
-    list_lengths_dev_.data(),
-    list_prefix_interleaved_dev_.data(),
-    metric_type_,
-    n_probes,
-    k,
-    n_queries,
-    dim_,
-    indices_dev_ptr,
-    distances_dev_ptr,
-    stream_,
-    greater_,
-    veclen_,
-    grid_dim_x_);
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(queries,
+                                                                  coarse_indices_dev.data(),
+                                                                  list_index_dev_.data(),
+                                                                  list_data_dev_.data(),
+                                                                  list_lengths_dev_.data(),
+                                                                  list_offsets_dev_.data(),
+                                                                  metric_type_,
+                                                                  n_probes,
+                                                                  k,
+                                                                  n_queries,
+                                                                  dim_,
+                                                                  indices_dev_ptr,
+                                                                  distances_dev_ptr,
+                                                                  stream_,
+                                                                  greater_,
+                                                                  veclen_,
+                                                                  grid_dim_x_);
 
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 3399e31044..2ce466994c 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -177,12 +177,8 @@ void approx_knn_build_index(const handle_t& handle,
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
     if (dynamic_cast<IVFFlatParam*>(params)) {
       IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
-
-      rmm::mr::managed_memory_resource managed_memory;
-      rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
-      copy(managed_index_array.data(), index_array, n * D, stream);
       approx_knn_cuivfl_ivfflat_build_index(
-        handle, index, IVFFlat_param, metric, managed_index_array.data(), n, D);
+        handle, index, IVFFlat_param, metric, index_array, n, D);
     } else {
       RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
     }
@@ -198,11 +194,8 @@ void approx_knn_build_index(const handle_t& handle,
           metric == raft::distance::DistanceType::L2Unexpanded ||
           metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::InnerProduct) {
-        rmm::mr::managed_memory_resource managed_memory;
-        rmm::device_uvector<T> managed_index_array(n * D, stream, &managed_memory);
-        copy(managed_index_array.data(), index_array, n * D, stream);
         approx_knn_cuivfl_ivfflat_build_index(
-          handle, index, IVFFlat_param, metric, managed_index_array.data(), n, D);
+          handle, index, IVFFlat_param, metric, index_array, n, D);
       } else {
         raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
         gpu_res->noTempMemory();

From 7ace0fb1c6175a1371d119f78260608c974a0105 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Jun 2022 08:43:43 +0200
Subject: [PATCH 062/118] Document the index building kernel

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index bbb5d8259b..5c027759fd 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -364,6 +364,32 @@ void cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
                   &kmeans_mem_res);
 }
 
+/**
+ * @brief Record the dataset into the index, one source row at a time.
+ *
+ * The index concists of the dataset rows, grouped by their labels (into clusters/lists).
+ * Within each cluster (list), the data is grouped into blocks of `WarpSize` interleaved
+ * vectors. Note, the total index length is slightly larger than the dataset length, because
+ * each cluster is padded by `WarpSize` elements
+ *
+ * CUDA launch grid:
+ *   X dimension must cover the dataset (n_rows), YZ are not used;
+ *   there are no dependencies between threads, hence no constraints on the block size.
+ *
+ * @tparam T the element type.
+ *
+ * @param[in] labels device pointer to the cluster ids for each row [n_rows]
+ * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
+ * @param[in] dataset device poitner to the input data [n_rows, dim]
+ * @param[out] list_data device pointer to the output [index_size_, dim]
+ * @param[out] list_index device pointer to the source ids corr. to the output [index_size_]
+ * @param[out] list_lengths device pointer to the cluster sizes [n_lists];
+ *                          it's used as an atomic counter, and must be initialized with zeros.
+ * @param n_rows source length
+ * @param dim the dimensionality of the data
+ * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`.
+ *
+ */
 template <typename T>
 __global__ void build_index_kernel(const uint32_t* labels,
                                    const uint32_t* list_offsets,
@@ -397,6 +423,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
   dataset += i * dim;
 
   // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
       list_data[l * WarpSize + ingroup_id + j] = dataset[l + j];
@@ -469,7 +496,7 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   list_data_dev_.resize(index_size_ * dim_, stream_);
   list_index_dev_.resize(index_size_, stream_);
 
-  // we'll rebuild the list_lengths in the following kernels, using it as an atomic counter.
+  // we'll rebuild the `list_lengths` in the following kernel, using it as an atomic counter.
   utils::memset(list_lengths, 0, sizeof(uint32_t) * n_lists_, stream_);
 
   const dim3 block_dim(256);

From 35157151db9a73ad221253c5472f535bce2e82d8 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Jun 2022 11:00:28 +0200
Subject: [PATCH 063/118] Added a dims padding todo

---
 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 5c027759fd..68998d13f6 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -126,6 +126,8 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
     centriod_dev_(0, stream_),
     centriod_norm_dev_(0, stream_)
 {
+  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
+  // template parameter (https://github.com/rapidsai/raft/issues/711)
   veclen_ = 16 / sizeof(T);
   while (dim % veclen_ != 0) {
     veclen_ = veclen_ >> 1;

From 6bd6560325a521848249e9994b3083b0de61c322 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Jun 2022 17:21:20 +0200
Subject: [PATCH 064/118] Move kmeans-related allocations and routines to
 ann_kmeans_balanced.cuh

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 303 ++----------------
 .../knn/detail/ann_kmeans_balanced.cuh        | 258 ++++++++++++++-
 .../raft/spatial/knn/detail/ann_quantized.cuh |  26 +-
 3 files changed, 289 insertions(+), 298 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 68998d13f6..c3229654a5 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -35,7 +35,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
@@ -50,7 +49,7 @@ class cuivflHandle {
                uint32_t n_lists,
                uint32_t n_iters);
 
-  void cuivflBuildIndex(const T* dataset, T* trainset, uint32_t n_rows, uint32_t nTrainset);
+  void cuivflBuildIndex(const T* dataset, uint32_t n_rows);
 
   void cuivflSetSearchParameters(const uint32_t n_probes,
                                  const uint32_t max_batch,
@@ -86,21 +85,14 @@ class cuivflHandle {
   rmm::device_uvector<uint32_t> list_offsets_dev_;
   // The device memory pointer; the number of each cluster(list); size [n_lists_]
   rmm::device_uvector<uint32_t> list_lengths_dev_;
-  // The device memory pointer; centriod; size [n_lists_, dim_]
-  rmm::device_uvector<float> centriod_dev_;
-  // The device memory pointer; centriod norm ; size [n_lists_, dim_]
-  rmm::device_uvector<float> centriod_norm_dev_;
+  // The device memory pointer; centroid; size [n_lists_, dim_]
+  rmm::device_uvector<float> centers_dev_;
+  // The device memory pointer; centroid norm ; size [n_lists_, dim_]
+  rmm::device_uvector<float> centers_norms_dev_;
   // Memory pool for use during search; after the first search is done the pool is not likely to
   // resize, saving the costs of allocations.
   std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res;
 
-  void cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                  const T* dataset,
-                                  T* trainset,
-                                  uint32_t* clusterSize,
-                                  uint32_t n_rows,
-                                  uint32_t n_rows_train);
-
   template <typename AccT>
   void cuivflSearchImpl(
     const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, AccT* distances);
@@ -123,8 +115,8 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
     list_index_dev_(0, stream_),
     list_offsets_dev_(0, stream_),
     list_lengths_dev_(0, stream_),
-    centriod_dev_(0, stream_),
-    centriod_norm_dev_(0, stream_)
+    centers_dev_(0, stream_),
+    centers_norms_dev_(0, stream_)
 {
   // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
   // template parameter (https://github.com/rapidsai/raft/issues/711)
@@ -134,238 +126,6 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
   }
 }
 
-/**
- * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
- *
- */
-template <typename T>
-void cuivflHandle<T>::cuivflBuildOptimizedKmeans(float* centriod_managed_ptr,
-                                                 const T* dataset,
-                                                 T* trainset,
-                                                 uint32_t* labels,
-                                                 uint32_t n_rows,
-                                                 uint32_t n_rows_train)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "cuivflBuildOptimizedKmeans(%u, %u)", n_rows, n_rows_train);
-
-  rmm::device_uvector<uint32_t> trainset_labels(n_rows_train, stream_);
-
-  float* cluster_centers = centriod_managed_ptr;
-
-  uint32_t n_mesoclusters = std::pow<double>(n_lists_, 0.5) + 0.5;
-  RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
-
-  rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> mesocluster_centers(n_mesoclusters * dim_, stream_, &managed_memory);
-  rmm::device_uvector<uint32_t> mesocluster_labels(n_rows_train, stream_, &managed_memory);
-  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream_, &managed_memory);
-  rmm::device_uvector<float> mesocluster_centers_tmp(
-    n_mesoclusters * dim_, stream_, &managed_memory);
-
-  auto mesocluster_sizes = mesocluster_sizes_buf.data();
-
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> kmeans_mem_res(
-    rmm::mr::get_current_device_resource(),
-    // an arbitrary guess on the upper bound of the workspace size
-    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim_ * 4));
-
-  // Training meso-clusters
-  for (uint32_t iter = 0; iter < 2 * n_iters_; iter += 2) {
-    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, n_iters_);
-    kmeans::predict(handle_,
-                    mesocluster_centers.data(),
-                    n_mesoclusters,
-                    dim_,
-                    trainset,
-                    n_rows_train,
-                    mesocluster_labels.data(),
-                    metric_type_,
-                    (iter != 0),
-                    mesocluster_centers_tmp.data(),
-                    mesocluster_sizes,
-                    true,
-                    stream_,
-                    &kmeans_mem_res);
-
-    if (iter + 1 < 2 * n_iters_) {
-      if (kmeans::adjust_centers(mesocluster_centers.data(),
-                                 n_mesoclusters,
-                                 dim_,
-                                 trainset,
-                                 n_rows_train,
-                                 mesocluster_labels.data(),
-                                 metric_type_,
-                                 mesocluster_sizes,
-                                 (float)1.0 / 4,
-                                 stream_)) {
-        iter -= 1;
-      }
-    }
-  }
-
-  handle_.sync_stream(stream_);
-
-  std::vector<uint32_t> fine_clusters_nums(n_mesoclusters);
-  std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
-  fine_clusters_csum[0] = 0;
-
-  uint32_t n_lists_rem            = n_lists_;
-  uint32_t n_rows_train_rem       = n_rows_train;
-  uint32_t mesocluster_size_max   = 0;
-  uint32_t mesocluster_size_sum   = 0;
-  uint32_t fine_clusters_nums_sum = 0;  // checking
-  uint32_t fine_clusters_nums_max = 0;
-  for (uint32_t i = 0; i < n_mesoclusters; i++) {
-    if (i < n_mesoclusters - 1) {
-      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_train_rem + .5;
-    } else {
-      fine_clusters_nums[i] = n_lists_rem;
-    }
-    n_lists_rem -= fine_clusters_nums[i];
-    n_rows_train_rem -= mesocluster_sizes[i];
-    mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]);
-    mesocluster_size_sum += mesocluster_sizes[i];
-    fine_clusters_nums_sum += fine_clusters_nums[i];
-    fine_clusters_nums_max    = max(fine_clusters_nums_max, fine_clusters_nums[i]);
-    fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
-  }
-
-  RAFT_LOG_DEBUG("(%s) # mesocluster_size_sum: %u", __func__, mesocluster_size_sum);
-  RAFT_LOG_DEBUG("(%s) # fine_clusters_nums_sum: %u", __func__, fine_clusters_nums_sum);
-  assert(mesocluster_size_sum == n_rows_train);
-  assert(fine_clusters_nums_sum == n_lists_);
-  assert(fine_clusters_csum[n_mesoclusters] == n_lists_);
-
-  rmm::device_uvector<uint32_t> mc_trainset_ids_buf(mesocluster_size_max, stream_, &managed_memory);
-  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim_, stream_, &managed_memory);
-  auto mc_trainset_ids = mc_trainset_ids_buf.data();
-  auto mc_trainset     = mc_trainset_buf.data();
-
-  // label (cluster ID) of each vector
-  rmm::device_uvector<uint32_t> mc_trainset_labels(mesocluster_size_max, stream_, &managed_memory);
-
-  rmm::device_uvector<float> mc_trainset_ccenters(
-    fine_clusters_nums_max * dim_, stream_, &managed_memory);
-  rmm::device_uvector<float> mc_trainset_ccenters_tmp(
-    fine_clusters_nums_max * dim_, stream_, &managed_memory);
-  // number of vectors in each cluster
-  rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
-    fine_clusters_nums_max, stream_, &managed_memory);
-
-  // Training clusters in each meso-clusters
-  uint32_t n_clusters_done = 0;
-  for (uint32_t i = 0; i < n_mesoclusters; i++) {
-    uint32_t k = 0;
-    for (uint32_t j = 0; j < n_rows_train; j++) {
-      if (mesocluster_labels.data()[j] != i) continue;
-      mc_trainset_ids[k++] = j;
-    }
-    assert(k == mesocluster_sizes[i]);
-
-    utils::copy_selected<T>(
-      mesocluster_sizes[i], dim_, trainset, mc_trainset_ids, dim_, mc_trainset, dim_, stream_);
-
-    for (uint32_t iter = 0; iter < 2 * n_iters_; iter += 2) {
-      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (n_lists: %u): %.1f / %u",
-                     i,
-                     fine_clusters_nums[i],
-                     (float)iter / 2,
-                     n_iters_);
-
-      kmeans::predict(handle_,
-                      mc_trainset_ccenters.data(),
-                      fine_clusters_nums[i],
-                      dim_,
-                      mc_trainset,
-                      mesocluster_sizes[i],
-                      mc_trainset_labels.data(),
-                      metric_type_,
-                      (iter != 0),
-                      mc_trainset_ccenters_tmp.data(),
-                      mc_trainset_csizes_tmp.data(),
-                      true,
-                      stream_,
-                      &kmeans_mem_res);
-
-      if (iter + 1 < 2 * n_iters_) {
-        if (kmeans::adjust_centers(mc_trainset_ccenters.data(),
-                                   fine_clusters_nums[i],
-                                   dim_,
-                                   mc_trainset,
-                                   mesocluster_sizes[i],
-                                   mc_trainset_labels.data(),
-                                   metric_type_,
-                                   mc_trainset_csizes_tmp.data(),
-                                   (float)1.0 / 4,
-                                   stream_)) {
-          iter -= 1;
-        }
-      }
-    }
-    copy(cluster_centers + (dim_ * fine_clusters_csum[i]),
-         mc_trainset_ccenters.data(),
-         fine_clusters_nums[i] * dim_,
-         stream_);
-    handle_.sync_stream(stream_);
-    n_clusters_done += fine_clusters_nums[i];
-  }  // end for (uint32_t i = 0; i < n_mesoclusters; i++)
-  assert(n_clusters_done == n_lists_);
-
-  mc_trainset_ccenters_tmp.resize(n_lists_ * dim_, stream_);
-  mc_trainset_csizes_tmp.resize(n_lists_, stream_);
-
-  // Fitting whole clusters using whole trainset.
-  for (int iter = 0; iter < 2; iter++) {
-    kmeans::predict(handle_,
-                    cluster_centers,
-                    n_lists_,
-                    dim_,
-                    trainset,
-                    n_rows_train,
-                    trainset_labels.data(),
-                    metric_type_,
-                    true,
-                    mc_trainset_ccenters_tmp.data(),
-                    mc_trainset_csizes_tmp.data(),
-                    true,
-                    stream_,
-                    &kmeans_mem_res);
-  }  // end for (int iter = 0; iter < 2; iter++)
-
-  RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
-
-  kmeans::predict(handle_,
-                  cluster_centers,
-                  n_lists_,
-                  dim_,
-                  dataset,
-                  n_rows_,
-                  labels,
-                  metric_type_,
-                  true,
-                  mc_trainset_ccenters_tmp.data(),
-                  mc_trainset_csizes_tmp.data(),
-                  true,
-                  stream_,
-                  &kmeans_mem_res);
-
-  kmeans::predict(handle_,
-                  cluster_centers,
-                  n_lists_,
-                  dim_,
-                  dataset,
-                  n_rows_,
-                  labels,
-                  metric_type_,
-                  true,
-                  mc_trainset_ccenters_tmp.data(),
-                  mc_trainset_csizes_tmp.data(),
-                  false,
-                  stream_,
-                  &kmeans_mem_res);
-}
-
 /**
  * @brief Record the dataset into the index, one source row at a time.
  *
@@ -434,45 +194,44 @@ __global__ void build_index_kernel(const uint32_t* labels,
 }
 
 template <typename T>
-void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
-                                       T* trainset,
-                                       uint32_t n_rows,
-                                       uint32_t n_rows_train)
+void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows)
 {
   n_rows_ = n_rows;
   RAFT_EXPECTS(n_rows_ > 0, "empty dataset");
 
-  rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> centriod_managed_buf(n_lists_ * dim_, stream_, &managed_memory);
-  auto centriod_managed_ptr = centriod_managed_buf.data();
-
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "unsupported data type");
 
-  // Alloc manage memory for centriods, trainset and workspace
-  rmm::device_uvector<uint32_t> labels_buf(n_rows_, stream_, &managed_memory);  // [numDataset]
-  auto labels = labels_buf.data();
+  // kmeans cluster ids for the dataset
+  rmm::device_uvector<uint32_t> labels(n_rows_, stream_);
 
   // Predict labels of the whole dataset
-  cuivflBuildOptimizedKmeans(centriod_managed_ptr, dataset, trainset, labels, n_rows, n_rows_train);
-
-  // Calculate the L2 related result
-  centriod_norm_dev_.resize(n_lists_, stream_);
+  centers_dev_.resize(n_lists_ * dim_, stream_);
+  kmeans::build_optimized_kmeans(handle_,
+                                 n_iters_,
+                                 dim_,
+                                 dataset,
+                                 n_rows,
+                                 labels.data(),
+                                 centers_dev_.data(),
+                                 n_lists_,
+                                 0.5,
+                                 metric_type_,
+                                 stream_);
 
+  // Precompute the centers vector norms for L2Expanded distance
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    utils::dots_along_rows(
-      n_lists_, dim_, centriod_managed_ptr, centriod_norm_dev_.data(), stream_);
-    RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
+    centers_norms_dev_.resize(n_lists_, stream_);
+    utils::dots_along_rows(n_lists_, dim_, centers_dev_.data(), centers_norms_dev_.data(), stream_);
+    RAFT_LOG_TRACE_VEC(centers_norms_dev_.data(), 20);
   }
-  centriod_dev_.resize(n_lists_ * dim_, stream_);
-  copy(centriod_dev_.data(), centriod_managed_ptr, n_lists_ * dim_, stream_);
 
   list_lengths_dev_.resize(n_lists_, stream_);
   auto list_lengths = list_lengths_dev_.data();
   stats::histogram(stats::HistType::HistTypeAuto,
                    reinterpret_cast<int*>(list_lengths),
                    n_lists_,
-                   labels,
+                   labels.data(),
                    n_rows_,
                    uint32_t(1),
                    stream_);
@@ -503,7 +262,7 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
 
   const dim3 block_dim(256);
   const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows_, block_dim.x));
-  build_index_kernel<<<grid_dim, block_dim, 0, stream_>>>(labels,
+  build_index_kernel<<<grid_dim, block_dim, 0, stream_>>>(labels.data(),
                                                           list_offsets,
                                                           dataset,
                                                           list_data_dev_.data(),
@@ -627,11 +386,11 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     utils::dots_along_rows(n_queries, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
     utils::outer_add(query_norm_dev.data(),
                      n_queries,
-                     centriod_norm_dev_.data(),
+                     centers_norms_dev_.data(),
                      n_lists_,
                      distance_buffer_dev.data(),
                      stream_);
-    RAFT_LOG_TRACE_VEC(centriod_norm_dev_.data(), 20);
+    RAFT_LOG_TRACE_VEC(centers_norms_dev_.data(), 20);
     RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
     alpha = 1.0f;
@@ -645,7 +404,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                n_queries,
                dim_,
                &alpha,
-               centriod_dev_.data(),
+               centers_dev_.data(),
                dim_,
                converted_queries_ptr,
                dim_,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index ae3dc98d8c..1bb5bfe851 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -19,6 +19,7 @@
 #include "../ann_common.h"
 #include "ann_utils.cuh"
 
+#include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
@@ -30,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_vector.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
@@ -159,7 +161,7 @@ void update_centers(float* centers,
     utils::accumulate_into_selected<T>(
       n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
   } else {
-    copy(centers, accumulated_centers, n_clusters * dim, stream);
+    raft::copy(centers, accumulated_centers, n_clusters * dim, stream);
   }
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
@@ -209,6 +211,8 @@ void predict(const handle_t& handle,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr = nullptr)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "kmeans::predict(%u, %u)", n_rows, n_clusters);
   if (n_rows == 0) {
     RAFT_LOG_WARN(
       "cuann_kmeans_predict: empty dataset (n_rows = %d, n_clusters = %d)", n_rows, n_clusters);
@@ -248,7 +252,7 @@ void predict(const handle_t& handle,
     auto minibatch_size = std::min<uint32_t>(max_minibatch_size, n_rows - offset);
 
     if constexpr (std::is_same_v<T, float>) {
-      copy(cur_dataset.data(), dataset + offset * dim, minibatch_size * dim, stream);
+      raft::copy(cur_dataset.data(), dataset + offset * dim, minibatch_size * dim, stream);
     } else {
       linalg::unaryOp(cur_dataset.data(),
                       dataset + offset * dim,
@@ -333,6 +337,8 @@ auto adjust_centers(float* centers,
                     float threshold,
                     rmm::cuda_stream_view stream) -> bool
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "kmeans::adjust_centers(%u, %u)", n_rows, n_clusters);
   stream.synchronize();
   if (n_clusters == 0) { return false; }
   constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
@@ -384,4 +390,252 @@ auto adjust_centers(float* centers,
   return adjusted;
 }
 
+/**
+ * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
+ *
+ */
+template <typename T>
+void build_optimized_kmeans(const handle_t& handle,
+                            uint32_t n_iters,
+                            size_t dim,
+                            const T* dataset,  // device
+                            size_t n_rows,
+                            uint32_t* labels,        // device
+                            float* cluster_centers,  // device
+                            size_t n_clusters,
+                            double trainset_fraction,  // 0 < trainset_fraction <= 1
+                            raft::distance::DistanceType metric,
+                            rmm::cuda_stream_view stream)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "kmeans::build_optimized_kmeans(%u, %u)", n_rows, n_clusters);
+
+  auto trainset_ratio =
+    std::max<size_t>(1, n_rows / std::max<size_t>(trainset_fraction * n_rows, n_clusters));
+  auto n_rows_train = n_rows / trainset_ratio;
+
+  uint32_t n_mesoclusters = std::pow<double>(n_clusters, 0.5) + 0.5;
+  RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
+
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::device_uvector<float> mesocluster_centers(n_mesoclusters * dim, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mesocluster_labels(n_rows_train, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
+  rmm::device_uvector<float> mesocluster_centers_tmp(n_mesoclusters * dim, stream, &managed_memory);
+
+  rmm::device_uvector<T> trainset(n_rows_train * dim, stream, &managed_memory);
+  // TODO: a proper sampling
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
+                                  sizeof(T) * dim,
+                                  dataset,
+                                  sizeof(T) * dim * trainset_ratio,
+                                  sizeof(T) * dim,
+                                  n_rows_train,
+                                  cudaMemcpyDefault,
+                                  stream));
+
+  auto mesocluster_sizes = mesocluster_sizes_buf.data();
+
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> kmeans_mem_res(
+    rmm::mr::get_current_device_resource(),
+    // an arbitrary guess on the upper bound of the workspace size
+    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4));
+
+  // Training meso-clusters
+  for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
+    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, n_iters);
+    kmeans::predict(handle,
+                    mesocluster_centers.data(),
+                    n_mesoclusters,
+                    dim,
+                    trainset.data(),
+                    n_rows_train,
+                    mesocluster_labels.data(),
+                    metric,
+                    (iter != 0),
+                    mesocluster_centers_tmp.data(),
+                    mesocluster_sizes,
+                    true,
+                    stream,
+                    &kmeans_mem_res);
+
+    if (iter + 1 < 2 * n_iters) {
+      if (kmeans::adjust_centers(mesocluster_centers.data(),
+                                 n_mesoclusters,
+                                 dim,
+                                 trainset.data(),
+                                 n_rows_train,
+                                 mesocluster_labels.data(),
+                                 metric,
+                                 mesocluster_sizes,
+                                 (float)1.0 / 4,
+                                 stream)) {
+        iter -= 1;
+      }
+    }
+  }
+
+  handle.sync_stream(stream);
+
+  std::vector<uint32_t> fine_clusters_nums(n_mesoclusters);
+  std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
+  fine_clusters_csum[0] = 0;
+
+  uint32_t n_lists_rem            = n_clusters;
+  uint32_t n_rows_train_rem       = n_rows_train;
+  uint32_t mesocluster_size_max   = 0;
+  uint32_t mesocluster_size_sum   = 0;
+  uint32_t fine_clusters_nums_sum = 0;  // checking
+  uint32_t fine_clusters_nums_max = 0;
+  for (uint32_t i = 0; i < n_mesoclusters; i++) {
+    if (i < n_mesoclusters - 1) {
+      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_train_rem + .5;
+    } else {
+      fine_clusters_nums[i] = n_lists_rem;
+    }
+    n_lists_rem -= fine_clusters_nums[i];
+    n_rows_train_rem -= mesocluster_sizes[i];
+    mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]);
+    mesocluster_size_sum += mesocluster_sizes[i];
+    fine_clusters_nums_sum += fine_clusters_nums[i];
+    fine_clusters_nums_max    = max(fine_clusters_nums_max, fine_clusters_nums[i]);
+    fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
+  }
+
+  RAFT_LOG_DEBUG("(%s) # mesocluster_size_sum: %u", __func__, mesocluster_size_sum);
+  RAFT_LOG_DEBUG("(%s) # fine_clusters_nums_sum: %u", __func__, fine_clusters_nums_sum);
+  assert(mesocluster_size_sum == n_rows_train);
+  assert(fine_clusters_nums_sum == n_clusters);
+  assert(fine_clusters_csum[n_mesoclusters] == n_clusters);
+
+  rmm::device_uvector<uint32_t> mc_trainset_ids_buf(mesocluster_size_max, stream, &managed_memory);
+  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim, stream, &managed_memory);
+  auto mc_trainset_ids = mc_trainset_ids_buf.data();
+  auto mc_trainset     = mc_trainset_buf.data();
+
+  // label (cluster ID) of each vector
+  rmm::device_uvector<uint32_t> mc_trainset_labels(mesocluster_size_max, stream, &managed_memory);
+
+  rmm::device_uvector<float> mc_trainset_ccenters(
+    fine_clusters_nums_max * dim, stream, &managed_memory);
+  rmm::device_uvector<float> mc_trainset_ccenters_tmp(
+    fine_clusters_nums_max * dim, stream, &managed_memory);
+  // number of vectors in each cluster
+  rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
+    fine_clusters_nums_max, stream, &managed_memory);
+
+  // Training clusters in each meso-clusters
+  uint32_t n_clusters_done = 0;
+  for (uint32_t i = 0; i < n_mesoclusters; i++) {
+    uint32_t k = 0;
+    for (uint32_t j = 0; j < n_rows_train; j++) {
+      if (mesocluster_labels.data()[j] != i) continue;
+      mc_trainset_ids[k++] = j;
+    }
+    assert(k == mesocluster_sizes[i]);
+
+    utils::copy_selected<T>(
+      mesocluster_sizes[i], dim, trainset.data(), mc_trainset_ids, dim, mc_trainset, dim, stream);
+
+    for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
+      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (n_lists: %u): %.1f / %u",
+                     i,
+                     fine_clusters_nums[i],
+                     (float)iter / 2,
+                     n_iters);
+
+      kmeans::predict(handle,
+                      mc_trainset_ccenters.data(),
+                      fine_clusters_nums[i],
+                      dim,
+                      mc_trainset,
+                      mesocluster_sizes[i],
+                      mc_trainset_labels.data(),
+                      metric,
+                      (iter != 0),
+                      mc_trainset_ccenters_tmp.data(),
+                      mc_trainset_csizes_tmp.data(),
+                      true,
+                      stream,
+                      &kmeans_mem_res);
+
+      if (iter + 1 < 2 * n_iters) {
+        if (kmeans::adjust_centers(mc_trainset_ccenters.data(),
+                                   fine_clusters_nums[i],
+                                   dim,
+                                   mc_trainset,
+                                   mesocluster_sizes[i],
+                                   mc_trainset_labels.data(),
+                                   metric,
+                                   mc_trainset_csizes_tmp.data(),
+                                   (float)1.0 / 4,
+                                   stream)) {
+          iter -= 1;
+        }
+      }
+    }
+    raft::copy(cluster_centers + (dim * fine_clusters_csum[i]),
+               mc_trainset_ccenters.data(),
+               fine_clusters_nums[i] * dim,
+               stream);
+    handle.sync_stream(stream);
+    n_clusters_done += fine_clusters_nums[i];
+  }  // end for (uint32_t i = 0; i < n_mesoclusters; i++)
+  assert(n_clusters_done == n_clusters);
+
+  mc_trainset_ccenters_tmp.resize(n_clusters * dim, stream);
+  mc_trainset_csizes_tmp.resize(n_clusters, stream);
+
+  // Fitting whole clusters using whole trainset.
+  for (int iter = 0; iter < 2; iter++) {
+    // NB: labels.size == n_rows >= n_rows_train; the output is not used.
+    kmeans::predict(handle,
+                    cluster_centers,
+                    n_clusters,
+                    dim,
+                    trainset.data(),
+                    n_rows_train,
+                    labels,
+                    metric,
+                    true,
+                    mc_trainset_ccenters_tmp.data(),
+                    mc_trainset_csizes_tmp.data(),
+                    true,
+                    stream,
+                    &kmeans_mem_res);
+  }  // end for (int iter = 0; iter < 2; iter++)
+
+  RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
+
+  kmeans::predict(handle,
+                  cluster_centers,
+                  n_clusters,
+                  dim,
+                  dataset,
+                  n_rows,
+                  labels,
+                  metric,
+                  true,
+                  mc_trainset_ccenters_tmp.data(),
+                  mc_trainset_csizes_tmp.data(),
+                  true,
+                  stream,
+                  &kmeans_mem_res);
+
+  kmeans::predict(handle,
+                  cluster_centers,
+                  n_clusters,
+                  dim,
+                  dataset,
+                  n_rows,
+                  labels,
+                  metric,
+                  true,
+                  mc_trainset_ccenters_tmp.data(),
+                  mc_trainset_csizes_tmp.data(),
+                  false,
+                  stream,
+                  &kmeans_mem_res);
+}
+
 }  // namespace raft::spatial::knn::detail::kmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 2ce466994c..62ffdef75f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -31,8 +31,6 @@
 #include <raft/spatial/knn/faiss_mr.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/GpuIndexFlat.h>
@@ -99,30 +97,10 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            IntType n,
                                            IntType D)
 {
-  auto stream         = handle.get_stream();
-  int ratio           = 2;  // TODO: take these parameters from API
-  int niter           = 20;
-  const int dim       = D;
-  const size_t ntrain = n / ratio;
-  assert(ntrain > 0);
-
-  rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<T> trainset(ntrain * dim, stream, &managed_memory);
-
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
-                                  sizeof(T) * dim,
-                                  dataset,
-                                  sizeof(T) * dim * ratio,
-                                  sizeof(T) * dim,
-                                  ntrain,
-                                  cudaMemcpyDefault,
-                                  stream));
-
+  int niter = 20;
   index->handle_.get<T>() =
     std::make_unique<detail::cuivflHandle<T>>(handle, metric, D, params->nlist, niter);
-
-  // NB: `trainset` is accessed by both CPU and GPU code here.
-  index->handle_.get<T>()->cuivflBuildIndex(dataset, trainset.data(), n, ntrain);
+  index->handle_.get<T>()->cuivflBuildIndex(dataset, n);
 }
 
 template <typename IntType = int>

From 281181489d919af24187c66f1bb4a007affeeca1 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Jun 2022 17:29:30 +0200
Subject: [PATCH 065/118] Add documentation to the build_optimized_kmeans

---
 .../knn/detail/ann_kmeans_balanced.cuh        | 24 +++++++++++++++----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 1bb5bfe851..d452371d63 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -391,19 +391,33 @@ auto adjust_centers(float* centers,
 }
 
 /**
- * NB: `dataset` is accessed only by GPU code, `trainset` accessed by CPU and GPU.
+ * kmeans
  *
+ * @tparam T element type
+ *
+ * @param handle
+ * @param n_iters number of training iterations
+ * @param dim number of columns in `centers` and `dataset`
+ * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
+ * @param n_rows number of rows in the input
+ * @param[out] labels a device pointer to the output labels [n_rows]
+ * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
+ * @param n_cluster
+ * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
+ *                            0 < trainset_fraction <= 1.
+ * @param metric the distance metric
+ * @param stream
  */
 template <typename T>
 void build_optimized_kmeans(const handle_t& handle,
                             uint32_t n_iters,
                             size_t dim,
-                            const T* dataset,  // device
+                            const T* dataset,
                             size_t n_rows,
-                            uint32_t* labels,        // device
-                            float* cluster_centers,  // device
+                            uint32_t* labels,
+                            float* cluster_centers,
                             size_t n_clusters,
-                            double trainset_fraction,  // 0 < trainset_fraction <= 1
+                            double trainset_fraction,
                             raft::distance::DistanceType metric,
                             rmm::cuda_stream_view stream)
 {

From fc3e46ed203f19dd3df71e3f4b2ce2a51b17ab67 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Jun 2022 15:54:24 +0200
Subject: [PATCH 066/118] Using mdarrays and structured index

---
 cpp/bench/spatial/knn.cu                      |   2 +-
 cpp/include/raft/spatial/knn/ann_common.h     |  25 +-
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 235 ++++++++----------
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 185 +++++++++-----
 .../raft/spatial/knn/detail/ann_quantized.cuh |  36 ++-
 cpp/test/spatial/ann_ivf_flat.cu              |   2 +-
 6 files changed, 265 insertions(+), 220 deletions(-)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index e8273bd9ed..bce1902ade 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -132,7 +132,7 @@ struct host_uvector {
 template <typename ValT, typename IdxT>
 struct ivf_flat_knn {
   raft::spatial::knn::knnIndex index;
-  raft::spatial::knn::IVFFlatParam ivf_params;
+  raft::spatial::knn::ivf_flat_params ivf_params;
   params ps;
 
   ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index bb857eb64b..7b4aa8c326 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -16,15 +16,16 @@
 
 #pragma once
 
-#include <raft/distance/distance_type.hpp>
-
-#include "detail/ann_ivf_flat.cuh"
 #include <faiss/gpu/GpuIndex.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::spatial::knn {
+
+namespace detail {
+template <typename T>
+class cuivflHandle;
+};
 
 struct cuivfl_handle_t {
   template <typename T>
@@ -119,11 +120,17 @@ struct knnIndexParam {
 };
 
 struct IVFParam : knnIndexParam {
+  /** The number of inverted lists (clusters) */
   int nlist;
+  /** The number of clusters to search. */
   int nprobe;
 };
 
-struct IVFFlatParam : IVFParam {
+struct ivf_flat_params : IVFParam {
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction = 0.5;
 };
 
 struct IVFPQParam : IVFParam {
@@ -137,6 +144,4 @@ struct IVFSQParam : IVFParam {
   bool encodeResidual;
 };
 
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
+};  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index c3229654a5..90c1cd5d60 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "../ann_common.h"
 #include "ann_ivf_flat_kernel.cuh"
 #include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
@@ -45,11 +46,9 @@ class cuivflHandle {
  public:
   cuivflHandle(const handle_t& handle,
                raft::distance::DistanceType metric_type,
-               uint32_t dim,
-               uint32_t n_lists,
-               uint32_t n_iters);
+               const ivf_flat_params& params);
 
-  void cuivflBuildIndex(const T* dataset, uint32_t n_rows);
+  void cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32_t dim);
 
   void cuivflSetSearchParameters(const uint32_t n_probes,
                                  const uint32_t max_batch,
@@ -59,36 +58,19 @@ class cuivflHandle {
     const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, float* distances);
 
   void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
-  uint32_t getDim() { return dim_; }
+  uint32_t getDim() { return index_.has_value() ? index_->dim() : 0; }
 
  private:
   const handle_t& handle_;
   const rmm::cuda_stream_view stream_;
+  ivf_flat_params params_;
 
   raft::distance::DistanceType metric_type_;
   bool greater_;
-  uint32_t n_lists_;     // The number of inverted lists= the number of centriods
-  uint32_t n_iters_;     // The number of uint32_terations for kmeans to build the indexs
-  uint32_t dim_;         // The dimension of vectors for input dataset
-  uint32_t n_probes_;    // The number of clusters for searching
-  uint32_t n_rows_;      // The number of elements for input dataset
-  uint32_t index_size_;  // The number of elements in 32 interleaved group for input dataset
-  uint32_t veclen_;      // The vectorization length of dataset in index.
   uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
+  // The built index
+  std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
 
-  // device pointer
-  //  The device memory pointer; inverted list for data; size [index_size_, dim_]
-  rmm::device_uvector<T> list_data_dev_;
-  // The device memory pointer; inverted list for index; size [index_size_]
-  rmm::device_uvector<uint32_t> list_index_dev_;
-  // The device memory pointer; Used for list_data_manage_ptr_; size [n_lists_]
-  rmm::device_uvector<uint32_t> list_offsets_dev_;
-  // The device memory pointer; the number of each cluster(list); size [n_lists_]
-  rmm::device_uvector<uint32_t> list_lengths_dev_;
-  // The device memory pointer; centroid; size [n_lists_, dim_]
-  rmm::device_uvector<float> centers_dev_;
-  // The device memory pointer; centroid norm ; size [n_lists_, dim_]
-  rmm::device_uvector<float> centers_norms_dev_;
   // Memory pool for use during search; after the first search is done the pool is not likely to
   // resize, saving the costs of allocations.
   std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res;
@@ -101,35 +83,15 @@ class cuivflHandle {
 template <typename T>
 cuivflHandle<T>::cuivflHandle(const handle_t& handle,
                               raft::distance::DistanceType metric_type,
-                              uint32_t dim,
-                              uint32_t n_lists,
-                              uint32_t n_iters)
-  : handle_(handle),
-    stream_(handle_.get_stream()),
-    dim_(dim),
-    n_lists_(n_lists),
-    n_iters_(n_iters),
-    metric_type_(metric_type),
-    grid_dim_x_(0),
-    list_data_dev_(0, stream_),
-    list_index_dev_(0, stream_),
-    list_offsets_dev_(0, stream_),
-    list_lengths_dev_(0, stream_),
-    centers_dev_(0, stream_),
-    centers_norms_dev_(0, stream_)
+                              const ivf_flat_params& params)
+  : handle_(handle), stream_(handle_.get_stream()), params_(params), grid_dim_x_(0)
 {
-  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
-  // template parameter (https://github.com/rapidsai/raft/issues/711)
-  veclen_ = 16 / sizeof(T);
-  while (dim % veclen_ != 0) {
-    veclen_ = veclen_ >> 1;
-  }
 }
 
 /**
  * @brief Record the dataset into the index, one source row at a time.
  *
- * The index concists of the dataset rows, grouped by their labels (into clusters/lists).
+ * The index consists of the dataset rows, grouped by their labels (into clusters/lists).
  * Within each cluster (list), the data is grouped into blocks of `WarpSize` interleaved
  * vectors. Note, the total index length is slightly larger than the dataset length, because
  * each cluster is padded by `WarpSize` elements
@@ -143,9 +105,9 @@ cuivflHandle<T>::cuivflHandle(const handle_t& handle,
  * @param[in] labels device pointer to the cluster ids for each row [n_rows]
  * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
  * @param[in] dataset device poitner to the input data [n_rows, dim]
- * @param[out] list_data device pointer to the output [index_size_, dim]
- * @param[out] list_index device pointer to the source ids corr. to the output [index_size_]
- * @param[out] list_lengths device pointer to the cluster sizes [n_lists];
+ * @param[out] list_data device pointer to the output [index_size, dim]
+ * @param[out] list_index device pointer to the source ids corr. to the output [index_size]
+ * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists];
  *                          it's used as an atomic counter, and must be initialized with zeros.
  * @param n_rows source length
  * @param dim the dimensionality of the data
@@ -158,7 +120,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
                                    const T* dataset,
                                    T* list_data,
                                    uint32_t* list_index,
-                                   uint32_t* list_lengths,
+                                   uint32_t* list_sizes_ptr,
                                    uint32_t n_rows,
                                    uint32_t dim,
                                    uint32_t veclen)
@@ -167,7 +129,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
   if (i >= n_rows) { return; }
 
   auto list_id     = labels[i];
-  auto inlist_id   = atomicAdd(list_lengths + list_id, 1);
+  auto inlist_id   = atomicAdd(list_sizes_ptr + list_id, 1);
   auto list_offset = list_offsets[list_id];
 
   // Record the source vector id in the index
@@ -194,45 +156,44 @@ __global__ void build_index_kernel(const uint32_t* labels,
 }
 
 template <typename T>
-void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows)
+void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32_t dim)
 {
-  n_rows_ = n_rows;
-  RAFT_EXPECTS(n_rows_ > 0, "empty dataset");
-
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "unsupported data type");
+  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
+
+  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
+  // template parameter (https://github.com/rapidsai/raft/issues/711)
+  uint32_t veclen = 16 / sizeof(T);
+  while (dim % veclen != 0) {
+    veclen = veclen >> 1;
+  }
+  auto n_lists = static_cast<uint32_t>(params_.nlist);
 
   // kmeans cluster ids for the dataset
-  rmm::device_uvector<uint32_t> labels(n_rows_, stream_);
+  rmm::device_uvector<uint32_t> labels(n_rows, stream_);
+  auto&& centers = make_array_for_index<float>(stream_, n_lists, dim);
 
   // Predict labels of the whole dataset
-  centers_dev_.resize(n_lists_ * dim_, stream_);
   kmeans::build_optimized_kmeans(handle_,
-                                 n_iters_,
-                                 dim_,
+                                 params_.kmeans_n_iters,
+                                 dim,
                                  dataset,
                                  n_rows,
                                  labels.data(),
-                                 centers_dev_.data(),
-                                 n_lists_,
-                                 0.5,
+                                 centers.data(),
+                                 n_lists,
+                                 params_.kmeans_trainset_fraction,
                                  metric_type_,
                                  stream_);
 
-  // Precompute the centers vector norms for L2Expanded distance
-  if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
-    centers_norms_dev_.resize(n_lists_, stream_);
-    utils::dots_along_rows(n_lists_, dim_, centers_dev_.data(), centers_norms_dev_.data(), stream_);
-    RAFT_LOG_TRACE_VEC(centers_norms_dev_.data(), 20);
-  }
-
-  list_lengths_dev_.resize(n_lists_, stream_);
-  auto list_lengths = list_lengths_dev_.data();
+  auto&& list_sizes   = make_array_for_index<uint32_t>(stream_, n_lists);
+  auto list_sizes_ptr = list_sizes.data();
   stats::histogram(stats::HistType::HistTypeAuto,
-                   reinterpret_cast<int*>(list_lengths),
-                   n_lists_,
+                   reinterpret_cast<int*>(list_sizes_ptr),
+                   n_lists,
                    labels.data(),
-                   n_rows_,
+                   n_rows,
                    uint32_t(1),
                    stream_);
 
@@ -240,38 +201,57 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows)
   // order with the rest
   auto thrust_policy = handle_.get_thrust_policy();
 
-  list_offsets_dev_.resize(n_lists_ + 1, stream_);
-  auto list_offsets = list_offsets_dev_.data();
+  auto&& list_offsets   = make_array_for_index<uint32_t>(stream_, n_lists + 1);
+  auto list_offsets_ptr = list_offsets.data();
 
   thrust::exclusive_scan(
     thrust_policy,
-    list_lengths,
-    list_lengths + n_lists_ + 1,
-    list_offsets,
+    list_sizes_ptr,
+    list_sizes_ptr + n_lists + 1,
+    list_offsets_ptr,
     uint32_t(0),
     [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
 
-  update_host(&index_size_, list_offsets + n_lists_, 1, stream_);
+  uint32_t index_size;
+  update_host(&index_size, list_offsets_ptr + n_lists, 1, stream_);
   handle_.sync_stream(stream_);
 
-  list_data_dev_.resize(index_size_ * dim_, stream_);
-  list_index_dev_.resize(index_size_, stream_);
+  auto&& data    = make_array_for_index<T>(stream_, index_size, dim);
+  auto&& indices = make_array_for_index<uint32_t>(stream_, index_size);
 
-  // we'll rebuild the `list_lengths` in the following kernel, using it as an atomic counter.
-  utils::memset(list_lengths, 0, sizeof(uint32_t) * n_lists_, stream_);
+  // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
+  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream_);
 
   const dim3 block_dim(256);
-  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows_, block_dim.x));
+  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows, block_dim.x));
   build_index_kernel<<<grid_dim, block_dim, 0, stream_>>>(labels.data(),
-                                                          list_offsets,
+                                                          list_offsets_ptr,
                                                           dataset,
-                                                          list_data_dev_.data(),
-                                                          list_index_dev_.data(),
-                                                          list_lengths,
-                                                          n_rows_,
-                                                          dim_,
-                                                          veclen_);
+                                                          data.data(),
+                                                          indices.data(),
+                                                          list_sizes_ptr,
+                                                          n_rows,
+                                                          dim,
+                                                          veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  // Precompute the centers vector norms for L2Expanded distance
+  auto compute_norms = [&]() {
+    auto&& r = make_array_for_index<float>(stream_, params_.nlist);
+    utils::dots_along_rows(params_.nlist, dim, centers.data(), r.data(), stream_);
+    RAFT_LOG_TRACE_VEC(center_norms.data(), 20);
+    return r;
+  };
+  auto&& center_norms = metric_type_ == raft::distance::DistanceType::L2Expanded
+                          ? std::optional(compute_norms())
+                          : std::nullopt;
+
+  // assemble the index
+  index_.emplace(
+    ivf_flat_index<T>{veclen, data, indices, list_sizes, list_offsets, centers, center_norms});
+
+  // check index invariants
+  index_->check_consistency();
 }
 
 template <typename T>
@@ -280,23 +260,18 @@ void cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t n_probes,
                                            const uint32_t k)
 {
   // query the gridDimX size to store probes topK output
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(nullptr,
-                                                                  nullptr,
-                                                                  nullptr,
-                                                                  nullptr,
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
                                                                   nullptr,
                                                                   nullptr,
+                                                                  n_queries,
                                                                   metric_type_,
                                                                   n_probes,
                                                                   k,
-                                                                  n_queries,
-                                                                  dim_,
+                                                                  greater_,
                                                                   nullptr,
                                                                   nullptr,
-                                                                  stream_,
-                                                                  greater_,
-                                                                  veclen_,
-                                                                  grid_dim_x_);
+                                                                  grid_dim_x_,
+                                                                  stream_);
 }
 
 template <typename T>
@@ -306,7 +281,7 @@ void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
 {
   RAFT_EXPECTS(n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
-  n_probes_ = n_probes;
+  params_.nprobe = n_probes;
   // Set the greater_
   if (metric_type_ == raft::distance::DistanceType::L2Expanded ||
       metric_type_ == raft::distance::DistanceType::L2Unexpanded) {
@@ -344,14 +319,14 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                        size_t* neighbors,  // [numQueries, topK]
                                        AccT* distances)
 {
-  uint32_t n_probes = std::min(n_probes_, n_lists_);
+  uint32_t n_probes = std::min(params_.nprobe, params_.nlist);
   grid_dim_x_       = 0;
   queryIVFFlatGridSize(n_probes, n_queries, k);
   auto search_mr = &(search_mem_res.value());
   // The norm of query
   rmm::device_uvector<float> query_norm_dev(n_queries, stream_, search_mr);
   // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(n_queries * n_lists_, stream_, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * params_.nlist, stream_, search_mr);
   // The topk distance value of cluster(list) and queries
   rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream_, search_mr);
   // The topk  index of cluster(list) and queries
@@ -363,7 +338,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
 
   size_t float_query_size;
   if constexpr (std::is_integral_v<T>) {
-    float_query_size = n_queries * dim_;
+    float_query_size = n_queries * index_->dim();
   } else {
     float_query_size = 0;
   }
@@ -374,7 +349,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
     linalg::unaryOp(
-      converted_queries_ptr, queries, n_queries * dim_, utils::mapping<float>{}, stream_);
+      converted_queries_ptr, queries, n_queries * index_->dim(), utils::mapping<float>{}, stream_);
   }
 
   float alpha = 1.0f;
@@ -383,14 +358,15 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
   if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
-    utils::dots_along_rows(n_queries, dim_, converted_queries_ptr, query_norm_dev.data(), stream_);
+    utils::dots_along_rows(
+      n_queries, index_->dim(), converted_queries_ptr, query_norm_dev.data(), stream_);
     utils::outer_add(query_norm_dev.data(),
                      n_queries,
-                     centers_norms_dev_.data(),
-                     n_lists_,
+                     index_->center_norms->data(),
+                     params_.nlist,
                      distance_buffer_dev.data(),
                      stream_);
-    RAFT_LOG_TRACE_VEC(centers_norms_dev_.data(), 20);
+    RAFT_LOG_TRACE_VEC(index_->center_norms->data(), 20);
     RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
     alpha = 1.0f;
@@ -400,17 +376,17 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
   linalg::gemm(handle_,
                true,
                false,
-               n_lists_,
+               params_.nlist,
                n_queries,
-               dim_,
+               index_->dim(),
                &alpha,
-               centers_dev_.data(),
-               dim_,
+               index_->centers.data(),
+               index_->dim(),
                converted_queries_ptr,
-               dim_,
+               index_->dim(),
                &beta,
                distance_buffer_dev.data(),
-               n_lists_,
+               params_.nlist,
                stream_);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
@@ -418,7 +394,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
                                          nullptr,
                                          n_queries,
-                                         n_lists_,
+                                         params_.nlist,
                                          n_probes,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
@@ -428,7 +404,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                               nullptr,
                                               n_queries,
-                                              n_lists_,
+                                              params_.nlist,
                                               n_probes,
                                               coarse_distances_dev.data(),
                                               coarse_indices_dev.data(),
@@ -446,23 +422,18 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
     indices_dev_ptr   = neighbors;
   }
 
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(queries,
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
+                                                                  queries,
                                                                   coarse_indices_dev.data(),
-                                                                  list_index_dev_.data(),
-                                                                  list_data_dev_.data(),
-                                                                  list_lengths_dev_.data(),
-                                                                  list_offsets_dev_.data(),
+                                                                  n_queries,
                                                                   metric_type_,
                                                                   n_probes,
                                                                   k,
-                                                                  n_queries,
-                                                                  dim_,
+                                                                  greater_,
                                                                   indices_dev_ptr,
                                                                   distances_dev_ptr,
-                                                                  stream_,
-                                                                  greater_,
-                                                                  veclen_,
-                                                                  grid_dim_x_);
+                                                                  grid_dim_x_,
+                                                                  stream_);
 
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index c963f55144..54bbc6de71 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -36,10 +36,100 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <optional>
+#include <raft/core/mdarray.hpp>
+
 namespace raft::spatial::knn::detail {
 
 constexpr int kThreadsPerBlock = 128;
 
+namespace md = std::experimental;
+
+template <typename T>
+struct ivf_flat_index {
+  using row_major = md::layout_right;
+  using extent_1d = md::extents<dynamic_extent>;
+  using extent_2d = md::extents<dynamic_extent, dynamic_extent>;
+
+  /**
+   * Vectorized load/store size in elements, determines the size of interleaved data chunks.
+   */
+  uint32_t veclen;
+
+  /**
+   * Inverted list data [size, dim].
+   *
+   * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
+   * Within each list (cluster), the data is grouped into blocks of `WarpSize` interleaved
+   * vectors. Note, the total index length is slightly larger than the source dataset length,
+   * because each cluster is padded by `WarpSize` elements.
+   *
+   * Interleaving pattern:
+   * within groups of `WarpSize` rows, the data is interleaved with the block size equal to
+   * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
+   * followed by a chunk of the same size of the next row, and so on.
+   *
+   * __Example__: veclen = 2, dim = 6, WarpSize = 32, list_size = 31
+   * `
+   *   x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
+   *   x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
+   *   x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
+   *   x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
+   *   x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
+   *   x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
+   * `
+   */
+  device_mdarray<T, extent_2d, row_major> data;
+  /** Inverted list indices: ids of items in the source data [size] */
+  device_mdarray<uint32_t, extent_1d, row_major> indices;
+
+  /** Sizes of the lists (clusters) [n_lists] */
+  device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
+  /**
+   * Offsets into the lists [n_lists + 1].
+   * The last value contains the total length of the index.
+   */
+  device_mdarray<uint32_t, extent_1d, row_major> list_offsets;
+
+  /** k-means cluster centers corresponding to the lists [n_lists, dim] */
+  device_mdarray<float, extent_2d, row_major> centers;
+  /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metrix [n_lists]  */
+  std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
+
+  /** Total length of the index. */
+  constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
+  /** Dimensionality of the data. */
+  constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
+  /** Number of clusters/inverted lists. */
+  constexpr inline auto n_lists() const noexcept -> size_t { return centers.extent(0); }
+
+  /** Throw an error if the index content is inconsistent. */
+  inline void check_consistency() const
+  {
+    RAFT_EXPECTS(dim() % veclen == 0, "dimensionality is not a multiple of the veclen");
+    RAFT_EXPECTS(data.extent(0) == indices.extent(0), "inconsistent index size");
+    RAFT_EXPECTS(data.extent(1) == centers.extent(1), "inconsistent data dimensionality");
+    RAFT_EXPECTS(                                             //
+      (centers.extent(0) == list_sizes.extent(0)) &&          //
+        (centers.extent(0) + 1 == list_offsets.extent(0)) &&  //
+        (!center_norms.has_value() || centers.extent(0) == center_norms->extent(0)),
+      "inconsistent number of lists (clusters)");
+  }
+};
+
+template <typename T, typename... Extents>
+static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
+{
+  using extent_t  = md::extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<T, extent_t, md::layout_right>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy{stream};
+
+  return mdarray_t{layout, policy};
+}
+
 /**
  * @brief Copy Veclen elements of type T from `query` to `query_shared` at position `loadDim *
  * Veclen`.
@@ -903,25 +993,21 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSi
 
 template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
 void launch_kernel(Lambda lambda,
+                   const ivf_flat_index<T>& index,
                    const T* queries,
                    const uint32_t* coarse_index,
-                   const uint32_t* list_index,
-                   const T* list_data,
-                   const uint32_t* list_lengths,
-                   const uint32_t* list_prefix_interleave,
+                   const uint32_t num_queries,
                    const uint32_t nprobe,
                    const uint32_t k,
-                   const uint32_t dim,
                    size_t* neighbors,
                    float* distances,
-                   const uint32_t batch_size,
                    uint32_t& grid_dim_x,
                    rmm::cuda_stream_view stream)
 {
-  constexpr auto kKernel = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
-  int max_query_smem     = 16384;
+  constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
+  const int max_query_smem = 16384;
   int query_smem_elems =
-    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(dim));
+    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
   int smem_size = query_smem_elems * sizeof(T);
 #ifndef USE_FAISS
   constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
@@ -933,12 +1019,12 @@ void launch_kernel(Lambda lambda,
   constexpr uint32_t kMaxGridY = 32768;
 
   if (grid_dim_x == 0) {
-    grid_dim_x = configure_launch_x(std::min(kMaxGridY, batch_size), nprobe, smem_size, kKernel);
+    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), nprobe, smem_size, kKernel);
     return;
   }
 
-  for (uint32_t query_offset = 0; query_offset < batch_size; query_offset += kMaxGridY) {
-    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, batch_size - query_offset);
+  for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) {
+    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, num_queries - query_offset);
     dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
     dim3 block_dim(kThreadsPerBlock);
     RAFT_LOG_TRACE(
@@ -953,16 +1039,16 @@ void launch_kernel(Lambda lambda,
                                                         query_smem_elems,
                                                         queries,
                                                         coarse_index,
-                                                        list_index,
-                                                        list_data,
-                                                        list_lengths,
-                                                        list_prefix_interleave,
+                                                        index.indices.data(),
+                                                        index.data.data(),
+                                                        index.list_sizes.data(),
+                                                        index.list_offsets.data(),
                                                         nprobe,
                                                         k,
-                                                        dim,
+                                                        index.dim(),
                                                         neighbors,
                                                         distances);
-    queries += grid_dim_y * dim;
+    queries += grid_dim_y * index.dim();
     neighbors += grid_dim_y * grid_dim_x * k;
     distances += grid_dim_y * grid_dim_x * k;
   }
@@ -1081,66 +1167,51 @@ struct select_interleaved_scan_kernel {
  * @tparam T value type
  * @tparam AccT accumulated type
  *
+ * @param index previously built ivf-flat index
  * @param[in] queries device pointer to the query vectors [batch_size, dim]
- * @param[in] coarse_index device pointer to the cluster (list) ids [batch_size, nprobe]
- * @param[in] list_index device pointer to the row ids in each cluster [nrow]
- * @param[in] list_data device pointer to the data in all clusters interleaved [nrow, dim]
- * @param[in] list_lengths device pointer to the numbers of vectors in each cluster [nlist]
- * @param[in] list_prefix_interleave device pointer to the offsets of each cluster in list_index
- * [nlist]
- * @param[in] metric type of the measured distance
- * @param[in] nprobe number of nearest clusters to query
- * @param[in] k number of nearest neighbors.
+ * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
+ * @param n_queries batch size
+ * @param metric type of the measured distance
+ * @param n_probes number of nearest clusters to query
+ * @param k number of nearest neighbors.
  *            NB: the maximum value of `k` is limited statically by `topk::kMaxCapacity`.
- * @param[in] batch_size number of query vectors
- * @param[in] dim dimensionality of search data and query vectors
+ * @param greater whether to select nearest (false) or furthest (true) points w.r.t. the given
+ * metric.
  * @param[out] neighbors device pointer to the result indices for each query and cluster
  * [batch_size, grid_dim_x, k]
  * @param[out] distances device pointer to the result distances for each query and cluster
  * [batch_size, grid_dim_x, k]
- * @param[in] stream
- * @param[in] greater whether to select nearest (false) or furthest (true) points w.r.t. the given
- * metric.
- * @param[in] veclen (optimization parameters) size of the vector for vectorized processing
- * @param[inout] grid_dim_x number of blocks launched across all nprobe clusters;
- *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= nprobe)
+ * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
+ *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
+ * @param stream
  */
 template <typename T, typename AccT>
-void ivfflat_interleaved_scan(const T* queries,
-                              const uint32_t* coarse_index,
-                              const uint32_t* list_index,
-                              const T* list_data,
-                              const uint32_t* list_lengths,
-                              const uint32_t* list_prefix_interleave,
+void ivfflat_interleaved_scan(const ivf_flat_index<T>& index,
+                              const T* queries,
+                              const uint32_t* coarse_query_results,
+                              const uint32_t n_queries,
                               const raft::distance::DistanceType metric,
-                              const uint32_t nprobe,
+                              const uint32_t n_probes,
                               const uint32_t k,
-                              const uint32_t batch_size,
-                              const uint32_t dim,
+                              const bool greater,
                               size_t* neighbors,
                               float* distances,
-                              rmm::cuda_stream_view stream,
-                              const bool greater,
-                              const int veclen,
-                              uint32_t& grid_dim_x)
+                              uint32_t& grid_dim_x,
+                              rmm::cuda_stream_view stream)
 {
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
   select_interleaved_scan_kernel<T, AccT>::run(capacity,
-                                               veclen,
+                                               index.veclen,
                                                greater,
                                                metric,
+                                               index,
                                                queries,
-                                               coarse_index,
-                                               list_index,
-                                               list_data,
-                                               list_lengths,
-                                               list_prefix_interleave,
-                                               nprobe,
+                                               coarse_query_results,
+                                               n_queries,
+                                               n_probes,
                                                k,
-                                               dim,
                                                neighbors,
                                                distances,
-                                               batch_size,
                                                grid_dim_x,
                                                stream);
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 62ffdef75f..f6cc56cbb4 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -91,16 +91,14 @@ void approx_knn_ivfflat_build_index(
 template <typename T = float, typename IntType = int>
 void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            knnIndex* index,
-                                           IVFParam* params,
+                                           ivf_flat_params* params,
                                            raft::distance::DistanceType metric,
                                            T* dataset,
                                            IntType n,
                                            IntType D)
 {
-  int niter = 20;
-  index->handle_.get<T>() =
-    std::make_unique<detail::cuivflHandle<T>>(handle, metric, D, params->nlist, niter);
-  index->handle_.get<T>()->cuivflBuildIndex(dataset, n);
+  index->handle_.get<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, metric, *params);
+  index->handle_.get<T>()->cuivflBuildIndex(dataset, n, D);
 }
 
 template <typename IntType = int>
@@ -153,8 +151,8 @@ void approx_knn_build_index(const handle_t& handle,
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
-    if (dynamic_cast<IVFFlatParam*>(params)) {
-      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+    if (dynamic_cast<ivf_flat_params*>(params)) {
+      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
       approx_knn_cuivfl_ivfflat_build_index(
         handle, index, IVFFlat_param, metric, index_array, n, D);
     } else {
@@ -164,8 +162,8 @@ void approx_knn_build_index(const handle_t& handle,
     std::unique_ptr<MetricProcessor<float>> query_metric_processor =
       create_processor<float>(metric, n, D, 0, false, stream);
 
-    if (dynamic_cast<IVFFlatParam*>(params)) {
-      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+    if (dynamic_cast<ivf_flat_params*>(params)) {
+      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
       // cuivfl only supports L2/Inner product for now.
       if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
           metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
@@ -229,11 +227,11 @@ void approx_knn_search(const handle_t& handle,
     index->index->search(n, query_array, k, distances, indices);
 #else
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
-    if (dynamic_cast<IVFFlatParam*>(params)) {
-      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
-      int nprobe                  = IVFFlat_param->nprobe;
-      int max_batch               = n;
-      int max_k                   = k;
+    if (dynamic_cast<ivf_flat_params*>(params)) {
+      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
+      int nprobe                     = IVFFlat_param->nprobe;
+      int max_batch                  = n;
+      int max_k                      = k;
 
       index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
       index->handle_.get<T>()->cuivflSearch(
@@ -244,11 +242,11 @@ void approx_knn_search(const handle_t& handle,
       index->metric, n, index->handle_.get<T>()->getDim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
-    if (dynamic_cast<IVFFlatParam*>(params)) {
-      IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
-      int nprobe                  = IVFFlat_param->nprobe;
-      int max_batch               = n;
-      int max_k                   = k;
+    if (dynamic_cast<ivf_flat_params*>(params)) {
+      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
+      int nprobe                     = IVFFlat_param->nprobe;
+      int max_batch                  = n;
+      int max_k                      = k;
 
       index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
       index->handle_.get<T>()->cuivflSearch(
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 5c3a01c75b..665532ccaa 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -142,7 +142,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     {
       rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
       rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
-      raft::spatial::knn::IVFFlatParam ivfParams;
+      raft::spatial::knn::ivf_flat_params ivfParams;
       ivfParams.nprobe = ps.nprobe;
       ivfParams.nlist  = ps.nlist;
       raft::spatial::knn::knnIndex index;

From fb8c4b16ddc615fee68b9cc183876f2f93bcd86d Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Jun 2022 12:56:52 +0200
Subject: [PATCH 067/118] Fixed a memory leak and introduced a few assertions
 to check pointer alignment

---
 cpp/include/raft/spatial/knn/ann_common.h     | 80 +++++--------------
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 10 ++-
 .../knn/detail/ann_ivf_flat_kernel.cuh        | 16 +++-
 .../knn/detail/ann_kmeans_balanced.cuh        |  7 --
 .../raft/spatial/knn/detail/ann_quantized.cuh | 14 ++--
 cpp/test/spatial/ann_ivf_flat.cu              | 43 +++++-----
 6 files changed, 69 insertions(+), 101 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 7b4aa8c326..8d870556b8 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -27,84 +27,44 @@ template <typename T>
 class cuivflHandle;
 };
 
-struct cuivfl_handle_t {
-  template <typename T>
-  auto get() -> std::unique_ptr<detail::cuivflHandle<T>>&;
-
-  cuivfl_handle_t() {}
+struct knnIndex {
+  faiss::gpu::GpuIndex* index;
+  raft::distance::DistanceType metric;
+  float metricArg;
+  std::unique_ptr<detail::cuivflHandle<float>> ivf_flat_float_;
+  std::unique_ptr<detail::cuivflHandle<uint8_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<detail::cuivflHandle<int8_t>> ivf_flat_int8_t_;
 
-  ~cuivfl_handle_t()
+  raft::spatial::knn::RmmGpuResources* gpu_res;
+  int device;
+  ~knnIndex()
   {
-    if (dtype_.has_value()) {
-      switch (*dtype_) {
-        case CUDA_R_32F: impl.float_.~unique_ptr(); break;
-        case CUDA_R_8U: impl.uint8_t_.~unique_ptr(); break;
-        case CUDA_R_8I: impl.int8_t_.~unique_ptr(); break;
-        default: break;
-      }
-    }
+    delete index;
+    delete gpu_res;
   }
 
- private:
-  union handle {
-    void* dummy;
-    std::unique_ptr<detail::cuivflHandle<float>> float_;
-    std::unique_ptr<detail::cuivflHandle<uint8_t>> uint8_t_;
-    std::unique_ptr<detail::cuivflHandle<int8_t>> int8_t_;
-    handle() : dummy(nullptr){};
-    ~handle(){};
-  } impl;
-  std::optional<cudaDataType_t> dtype_;
+  template <typename T>
+  auto ivf_flat() -> std::unique_ptr<detail::cuivflHandle<T>>&;
 };
 
 template <>
-auto cuivfl_handle_t::get<float>() -> std::unique_ptr<detail::cuivflHandle<float>>&
+auto knnIndex::ivf_flat<float>() -> std::unique_ptr<detail::cuivflHandle<float>>&
 {
-  if (dtype_.has_value()) {
-    RAFT_EXPECTS(*dtype_ == CUDA_R_32F, "wrong element type");
-  } else {
-    *dtype_ = CUDA_R_32F;
-  }
-  return impl.float_;
+  return ivf_flat_float_;
 }
 
 template <>
-auto cuivfl_handle_t::get<uint8_t>() -> std::unique_ptr<detail::cuivflHandle<uint8_t>>&
+auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<detail::cuivflHandle<uint8_t>>&
 {
-  if (dtype_.has_value()) {
-    RAFT_EXPECTS(*dtype_ == CUDA_R_8U, "wrong element type");
-  } else {
-    *dtype_ = CUDA_R_8U;
-  }
-  return impl.uint8_t_;
+  return ivf_flat_uint8_t_;
 }
 
 template <>
-auto cuivfl_handle_t::get<int8_t>() -> std::unique_ptr<detail::cuivflHandle<int8_t>>&
+auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<detail::cuivflHandle<int8_t>>&
 {
-  if (dtype_.has_value()) {
-    RAFT_EXPECTS(*dtype_ == CUDA_R_8I, "wrong element type");
-  } else {
-    *dtype_ = CUDA_R_8I;
-  }
-  return impl.int8_t_;
+  return ivf_flat_int8_t_;
 }
 
-struct knnIndex {
-  faiss::gpu::GpuIndex* index;
-  raft::distance::DistanceType metric;
-  float metricArg;
-  cuivfl_handle_t handle_;
-
-  raft::spatial::knn::RmmGpuResources* gpu_res;
-  int device;
-  ~knnIndex()
-  {
-    delete index;
-    delete gpu_res;
-  }
-};
-
 enum QuantizerType : unsigned int {
   QT_8bit,
   QT_4bit,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 90c1cd5d60..5079de3fec 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -65,7 +65,7 @@ class cuivflHandle {
   const rmm::cuda_stream_view stream_;
   ivf_flat_params params_;
 
-  raft::distance::DistanceType metric_type_;
+  const raft::distance::DistanceType metric_type_;
   bool greater_;
   uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
   // The built index
@@ -84,7 +84,11 @@ template <typename T>
 cuivflHandle<T>::cuivflHandle(const handle_t& handle,
                               raft::distance::DistanceType metric_type,
                               const ivf_flat_params& params)
-  : handle_(handle), stream_(handle_.get_stream()), params_(params), grid_dim_x_(0)
+  : handle_(handle),
+    stream_(handle_.get_stream()),
+    params_(params),
+    grid_dim_x_(0),
+    metric_type_(metric_type)
 {
 }
 
@@ -239,7 +243,7 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
   auto compute_norms = [&]() {
     auto&& r = make_array_for_index<float>(stream_, params_.nlist);
     utils::dots_along_rows(params_.nlist, dim, centers.data(), r.data(), stream_);
-    RAFT_LOG_TRACE_VEC(center_norms.data(), 20);
+    RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
   };
   auto&& center_norms = metric_type_ == raft::distance::DistanceType::L2Expanded
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 54bbc6de71..c671cd34ba 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -97,11 +97,14 @@ struct ivf_flat_index {
   std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
 
   /** Total length of the index. */
-  constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
+  [[nodiscard]] constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
   /** Dimensionality of the data. */
-  constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
   /** Number of clusters/inverted lists. */
-  constexpr inline auto n_lists() const noexcept -> size_t { return centers.extent(0); }
+  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> size_t
+  {
+    return centers.extent(0);
+  }
 
   /** Throw an error if the index content is inconsistent. */
   inline void check_consistency() const
@@ -114,6 +117,8 @@ struct ivf_flat_index {
         (centers.extent(0) + 1 == list_offsets.extent(0)) &&  //
         (!center_norms.has_value() || centers.extent(0) == center_norms->extent(0)),
       "inconsistent number of lists (clusters)");
+    RAFT_EXPECTS(reinterpret_cast<size_t>(data.data()) % (veclen * sizeof(T)) == 0,
+                 "The data storage pointer is not aligned to the vector length");
   }
 };
 
@@ -930,7 +935,6 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
 
       if (dim > query_smem_elems) {
         constexpr int kUnroll = WarpSize / Veclen;
-        ;
         loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
           obj(dist, compute_dist);
         for (int dBase = shLoadDim; dBase < full_warps_along_dim; dBase += WarpSize) {  //
@@ -1004,6 +1008,10 @@ void launch_kernel(Lambda lambda,
                    uint32_t& grid_dim_x,
                    rmm::cuda_stream_view stream)
 {
+  RAFT_EXPECTS(reinterpret_cast<size_t>(queries) % (Veclen * sizeof(T)) == 0,
+               "Queries data is not aligned to the vector load size (Veclen).");
+  RAFT_EXPECTS(Veclen == index.veclen,
+               "Configured Veclen does not match the index interleaving pattern.");
   constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
   const int max_query_smem = 16384;
   int query_smem_elems =
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index d452371d63..230a6a89ce 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -457,7 +457,6 @@ void build_optimized_kmeans(const handle_t& handle,
 
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
-    RAFT_LOG_TRACE("Training kmeans of meso-clusters: %.1f / %u", (float)iter / 2, n_iters);
     kmeans::predict(handle,
                     mesocluster_centers.data(),
                     n_mesoclusters,
@@ -552,12 +551,6 @@ void build_optimized_kmeans(const handle_t& handle,
       mesocluster_sizes[i], dim, trainset.data(), mc_trainset_ids, dim, mc_trainset, dim, stream);
 
     for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
-      RAFT_LOG_TRACE("Training kmeans of clusters in meso-cluster %u (n_lists: %u): %.1f / %u",
-                     i,
-                     fine_clusters_nums[i],
-                     (float)iter / 2,
-                     n_iters);
-
       kmeans::predict(handle,
                       mc_trainset_ccenters.data(),
                       fine_clusters_nums[i],
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index f6cc56cbb4..e8c962fd0c 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -97,8 +97,8 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            IntType n,
                                            IntType D)
 {
-  index->handle_.get<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, metric, *params);
-  index->handle_.get<T>()->cuivflBuildIndex(dataset, n, D);
+  index->ivf_flat<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, metric, *params);
+  index->ivf_flat<T>()->cuivflBuildIndex(dataset, n, D);
 }
 
 template <typename IntType = int>
@@ -233,13 +233,13 @@ void approx_knn_search(const handle_t& handle,
       int max_batch                  = n;
       int max_k                      = k;
 
-      index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-      index->handle_.get<T>()->cuivflSearch(
+      index->ivf_flat<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+      index->ivf_flat<T>()->cuivflSearch(
         query_array, max_batch, max_k, (size_t*)indices, distances);
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-      index->metric, n, index->handle_.get<T>()->getDim(), k, false, handle.get_stream());
+      index->metric, n, index->ivf_flat<T>()->getDim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
     if (dynamic_cast<ivf_flat_params*>(params)) {
@@ -248,8 +248,8 @@ void approx_knn_search(const handle_t& handle,
       int max_batch                  = n;
       int max_k                      = k;
 
-      index->handle_.get<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-      index->handle_.get<T>()->cuivflSearch(
+      index->ivf_flat<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
+      index->ivf_flat<T>()->cuivflSearch(
         query_array, max_batch, max_k, (size_t*)indices, distances);
     }
     query_metric_processor->revert(query_array);
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 665532ccaa..0ae5aa2a4f 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -92,9 +92,12 @@ auto eval_knn(const std::vector<T>& expected_idx,
   RAFT_LOG_INFO("Recall = %zu/%zu", match_count, total_count);
   double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
   if (actual_recall < min_recall - eps) {
-    return testing::AssertionFailure()
-           << "actual recall (" << actual_recall
-           << ") is smaller than the minimum expected recall (" << min_recall << ").";
+    RAFT_LOG_WARN("Recall is suspiciously too low (%f < %f)", actual_recall, min_recall);
+    if (match_count == 0 || actual_recall < min_recall * min_recall - eps) {
+      return testing::AssertionFailure()
+             << "actual recall (" << actual_recall
+             << ") is much smaller than the minimum expected recall (" << min_recall << ").";
+    }
   }
   return testing::AssertionSuccess();
 }
@@ -111,7 +114,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
   }
 
  protected:
-  void testIVFFlat(bool is8bit)
+  void testIVFFlat()
   {
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<int64_t> indices_ivfflat(queries_size);
@@ -142,16 +145,16 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     {
       rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
       rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
-      raft::spatial::knn::ivf_flat_params ivfParams;
-      ivfParams.nprobe = ps.nprobe;
-      ivfParams.nlist  = ps.nlist;
+      raft::spatial::knn::ivf_flat_params params;
+      params.nprobe = ps.nprobe;
+      params.nlist  = ps.nlist;
       raft::spatial::knn::knnIndex index;
       index.index   = nullptr;
       index.gpu_res = nullptr;
 
       approx_knn_build_index(handle_,
                              &index,
-                             dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                             dynamic_cast<raft::spatial::knn::knnIndexParam*>(&params),
                              ps.metric,
                              0,
                              database.data(),
@@ -163,7 +166,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                         distances_ivfflat_dev.data(),
                         indices_ivfflat_dev.data(),
                         &index,
-                        dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                        dynamic_cast<raft::spatial::knn::knnIndexParam*>(&params),
                         ps.k,
                         search_queries.data(),
                         ps.num_queries);
@@ -175,14 +178,14 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     // unless something is really wrong with clustering, this could serve as a lower bound on recall
     double min_recall = static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist);
     // verify.
-    eval_knn(indices_naive,
-             indices_ivfflat,
-             distances_naive,
-             distances_ivfflat,
-             ps.num_queries,
-             ps.k,
-             float(0.001),
-             min_recall);
+    ASSERT_TRUE(eval_knn(indices_naive,
+                         indices_ivfflat,
+                         distances_naive,
+                         distances_ivfflat,
+                         ps.num_queries,
+                         ps.k,
+                         float(0.001),
+                         min_recall));
   }
 
   void SetUp() override
@@ -240,17 +243,17 @@ const std::vector<AnnIvfFlatInputs> inputs = {
   {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}};
 
 typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(false); }
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
 
 typedef AnnIVFFlatTest<float, uint8_t> AnnIVFFlatTestF_uint8;
-TEST_P(AnnIVFFlatTestF_uint8, AnnIVFFlat) { this->testIVFFlat(true); }
+TEST_P(AnnIVFFlatTestF_uint8, AnnIVFFlat) { this->testIVFFlat(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_uint8, ::testing::ValuesIn(inputs));
 
 typedef AnnIVFFlatTest<float, int8_t> AnnIVFFlatTestF_int8;
-TEST_P(AnnIVFFlatTestF_int8, AnnIVFFlat) { this->testIVFFlat(true); }
+TEST_P(AnnIVFFlatTestF_int8, AnnIVFFlat) { this->testIVFFlat(); }
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_int8, ::testing::ValuesIn(inputs));
 

From 092d42816841f85bcbd1012a2af87d0541094a34 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Jun 2022 16:40:19 +0200
Subject: [PATCH 068/118] Refactoring build_optimized_kmeans

---
 .../knn/detail/ann_kmeans_balanced.cuh        | 324 ++++++++++++------
 1 file changed, 211 insertions(+), 113 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 230a6a89ce..085cfa9f85 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -390,97 +390,50 @@ auto adjust_centers(float* centers,
   return adjusted;
 }
 
-/**
- * kmeans
- *
- * @tparam T element type
- *
- * @param handle
- * @param n_iters number of training iterations
- * @param dim number of columns in `centers` and `dataset`
- * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
- * @param n_rows number of rows in the input
- * @param[out] labels a device pointer to the output labels [n_rows]
- * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
- * @param n_cluster
- * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
- *                            0 < trainset_fraction <= 1.
- * @param metric the distance metric
- * @param stream
- */
+/** predict & adjust_centers combined in an iterative process. */
 template <typename T>
-void build_optimized_kmeans(const handle_t& handle,
-                            uint32_t n_iters,
-                            size_t dim,
-                            const T* dataset,
-                            size_t n_rows,
-                            uint32_t* labels,
-                            float* cluster_centers,
-                            size_t n_clusters,
-                            double trainset_fraction,
-                            raft::distance::DistanceType metric,
-                            rmm::cuda_stream_view stream)
+auto build_clusters(const handle_t& handle,
+                    uint32_t n_iters,
+                    size_t dim,
+                    const T* dataset_mptr,
+                    size_t n_rows,
+                    size_t n_clusters,
+                    raft::distance::DistanceType metric,
+                    rmm::mr::managed_memory_resource* managed_memory,
+                    rmm::mr::device_memory_resource* device_memory,
+                    rmm::cuda_stream_view stream)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::build_optimized_kmeans(%u, %u)", n_rows, n_clusters);
-
-  auto trainset_ratio =
-    std::max<size_t>(1, n_rows / std::max<size_t>(trainset_fraction * n_rows, n_clusters));
-  auto n_rows_train = n_rows / trainset_ratio;
-
-  uint32_t n_mesoclusters = std::pow<double>(n_clusters, 0.5) + 0.5;
-  RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
-
-  rmm::mr::managed_memory_resource managed_memory;
-  rmm::device_uvector<float> mesocluster_centers(n_mesoclusters * dim, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> mesocluster_labels(n_rows_train, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
-  rmm::device_uvector<float> mesocluster_centers_tmp(n_mesoclusters * dim, stream, &managed_memory);
-
-  rmm::device_uvector<T> trainset(n_rows_train * dim, stream, &managed_memory);
-  // TODO: a proper sampling
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
-                                  sizeof(T) * dim,
-                                  dataset,
-                                  sizeof(T) * dim * trainset_ratio,
-                                  sizeof(T) * dim,
-                                  n_rows_train,
-                                  cudaMemcpyDefault,
-                                  stream));
-
-  auto mesocluster_sizes = mesocluster_sizes_buf.data();
-
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> kmeans_mem_res(
-    rmm::mr::get_current_device_resource(),
-    // an arbitrary guess on the upper bound of the workspace size
-    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4));
+  rmm::device_uvector<float> cluster_centers(n_clusters * dim, stream, managed_memory);
+  rmm::device_uvector<uint32_t> cluster_labels(n_rows, stream, managed_memory);
+  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, managed_memory);
+  rmm::device_uvector<float> cluster_centers_tmp(n_clusters * dim, stream, managed_memory);
 
   // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
     kmeans::predict(handle,
-                    mesocluster_centers.data(),
-                    n_mesoclusters,
+                    cluster_centers.data(),
+                    n_clusters,
                     dim,
-                    trainset.data(),
-                    n_rows_train,
-                    mesocluster_labels.data(),
+                    dataset_mptr,
+                    n_rows,
+                    cluster_labels.data(),
                     metric,
                     (iter != 0),
-                    mesocluster_centers_tmp.data(),
-                    mesocluster_sizes,
+                    cluster_centers_tmp.data(),
+                    cluster_sizes.data(),
                     true,
                     stream,
-                    &kmeans_mem_res);
+                    device_memory);
 
     if (iter + 1 < 2 * n_iters) {
-      if (kmeans::adjust_centers(mesocluster_centers.data(),
-                                 n_mesoclusters,
+      if (kmeans::adjust_centers(cluster_centers.data(),
+                                 n_clusters,
                                  dim,
-                                 trainset.data(),
-                                 n_rows_train,
-                                 mesocluster_labels.data(),
+                                 dataset_mptr,
+                                 n_rows,
+                                 cluster_labels.data(),
                                  metric,
-                                 mesocluster_sizes,
+                                 cluster_sizes.data(),
                                  (float)1.0 / 4,
                                  stream)) {
         iter -= 1;
@@ -488,67 +441,114 @@ void build_optimized_kmeans(const handle_t& handle,
     }
   }
 
-  handle.sync_stream(stream);
+  return std::make_tuple(std::move(cluster_labels), std::move(cluster_sizes));
+}
 
+/** Calculate how many fine clusters should belong to each mesocluster. */
+auto arrange_fine_clusters(size_t n_clusters,
+                           size_t n_mesoclusters,
+                           size_t n_rows,
+                           const uint32_t* mesocluster_sizes)
+{
   std::vector<uint32_t> fine_clusters_nums(n_mesoclusters);
   std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
   fine_clusters_csum[0] = 0;
 
   uint32_t n_lists_rem            = n_clusters;
-  uint32_t n_rows_train_rem       = n_rows_train;
-  uint32_t mesocluster_size_max   = 0;
+  uint32_t n_rows_rem             = n_rows;
   uint32_t mesocluster_size_sum   = 0;
-  uint32_t fine_clusters_nums_sum = 0;  // checking
+  uint32_t mesocluster_size_max   = 0;
   uint32_t fine_clusters_nums_max = 0;
   for (uint32_t i = 0; i < n_mesoclusters; i++) {
     if (i < n_mesoclusters - 1) {
-      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_train_rem + .5;
+      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5;
     } else {
       fine_clusters_nums[i] = n_lists_rem;
     }
     n_lists_rem -= fine_clusters_nums[i];
-    n_rows_train_rem -= mesocluster_sizes[i];
+    n_rows_rem -= mesocluster_sizes[i];
     mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]);
     mesocluster_size_sum += mesocluster_sizes[i];
-    fine_clusters_nums_sum += fine_clusters_nums[i];
     fine_clusters_nums_max    = max(fine_clusters_nums_max, fine_clusters_nums[i]);
     fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
   }
 
-  RAFT_LOG_DEBUG("(%s) # mesocluster_size_sum: %u", __func__, mesocluster_size_sum);
-  RAFT_LOG_DEBUG("(%s) # fine_clusters_nums_sum: %u", __func__, fine_clusters_nums_sum);
-  assert(mesocluster_size_sum == n_rows_train);
-  assert(fine_clusters_nums_sum == n_clusters);
-  assert(fine_clusters_csum[n_mesoclusters] == n_clusters);
+  RAFT_EXPECTS(mesocluster_size_sum == n_rows,
+               "mesocluster sizes do not add up (%u) to the total trainset size (%zu)",
+               mesocluster_size_sum,
+               n_rows);
+  RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters,
+               "fine cluster numbers do not add up (%u) to the total number of clusters (%zu)",
+               fine_clusters_csum[n_mesoclusters],
+               n_rows
+
+  );
+
+  return std::make_tuple(mesocluster_size_max,
+                         fine_clusters_nums_max,
+                         std::move(fine_clusters_nums),
+                         std::move(fine_clusters_csum));
+}
 
-  rmm::device_uvector<uint32_t> mc_trainset_ids_buf(mesocluster_size_max, stream, &managed_memory);
-  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim, stream, &managed_memory);
+/**
+ *  Given the (coarse) mesoclusters and the distribution of fine clusters within them,
+ *  build the fine clusters.
+ *
+ *  Processing one mesocluster at a time:
+ *   1. Copy mesocluster data into a separate buffer
+ *   2. Predict fine cluster
+ *   3. Refince the fine cluster centers
+ *
+ *  As a result, the fine clusters are what is returned by `build_optimized_kmeans`;
+ *  this function returns the total number of fine clusters, which can be checked to be
+ *  the same as the requested number of clusters.
+ */
+template <typename T>
+auto build_fine_clusters(const handle_t& handle,
+                         uint32_t n_iters,
+                         size_t dim,
+                         const T* dataset_mptr,
+                         const uint32_t* labels_mptr,
+                         size_t n_rows,
+                         const uint32_t* fine_clusters_nums,
+                         const uint32_t* fine_clusters_csum,
+                         const uint32_t* mesocluster_sizes,
+                         size_t n_mesoclusters,
+                         size_t mesocluster_size_max,
+                         size_t fine_clusters_nums_max,
+                         float* cluster_centers,
+                         raft::distance::DistanceType metric,
+                         rmm::mr::managed_memory_resource* managed_memory,
+                         rmm::mr::device_memory_resource* device_memory,
+                         rmm::cuda_stream_view stream) -> uint32_t
+{
+  rmm::device_uvector<uint32_t> mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory);
+  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim, stream, managed_memory);
   auto mc_trainset_ids = mc_trainset_ids_buf.data();
   auto mc_trainset     = mc_trainset_buf.data();
 
   // label (cluster ID) of each vector
-  rmm::device_uvector<uint32_t> mc_trainset_labels(mesocluster_size_max, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mc_trainset_labels(mesocluster_size_max, stream, managed_memory);
 
   rmm::device_uvector<float> mc_trainset_ccenters(
-    fine_clusters_nums_max * dim, stream, &managed_memory);
+    fine_clusters_nums_max * dim, stream, managed_memory);
   rmm::device_uvector<float> mc_trainset_ccenters_tmp(
-    fine_clusters_nums_max * dim, stream, &managed_memory);
+    fine_clusters_nums_max * dim, stream, managed_memory);
   // number of vectors in each cluster
   rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
-    fine_clusters_nums_max, stream, &managed_memory);
+    fine_clusters_nums_max, stream, managed_memory);
 
   // Training clusters in each meso-clusters
   uint32_t n_clusters_done = 0;
   for (uint32_t i = 0; i < n_mesoclusters; i++) {
     uint32_t k = 0;
-    for (uint32_t j = 0; j < n_rows_train; j++) {
-      if (mesocluster_labels.data()[j] != i) continue;
-      mc_trainset_ids[k++] = j;
+    for (uint32_t j = 0; j < n_rows; j++) {
+      if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; }
     }
-    assert(k == mesocluster_sizes[i]);
+    RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i);
 
     utils::copy_selected<T>(
-      mesocluster_sizes[i], dim, trainset.data(), mc_trainset_ids, dim, mc_trainset, dim, stream);
+      mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream);
 
     for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
       kmeans::predict(handle,
@@ -564,7 +564,7 @@ void build_optimized_kmeans(const handle_t& handle,
                       mc_trainset_csizes_tmp.data(),
                       true,
                       stream,
-                      &kmeans_mem_res);
+                      device_memory);
 
       if (iter + 1 < 2 * n_iters) {
         if (kmeans::adjust_centers(mc_trainset_ccenters.data(),
@@ -587,11 +587,109 @@ void build_optimized_kmeans(const handle_t& handle,
                stream);
     handle.sync_stream(stream);
     n_clusters_done += fine_clusters_nums[i];
-  }  // end for (uint32_t i = 0; i < n_mesoclusters; i++)
-  assert(n_clusters_done == n_clusters);
+  }
+  return n_clusters_done;
+}
+
+/**
+ * kmeans
+ *
+ * @tparam T element type
+ *
+ * @param handle
+ * @param n_iters number of training iterations
+ * @param dim number of columns in `centers` and `dataset`
+ * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
+ * @param n_rows number of rows in the input
+ * @param[out] labels a device pointer to the output labels [n_rows]
+ * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
+ * @param n_cluster
+ * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
+ *                            0 < trainset_fraction <= 1.
+ * @param metric the distance metric
+ * @param stream
+ */
+template <typename T>
+void build_optimized_kmeans(const handle_t& handle,
+                            uint32_t n_iters,
+                            size_t dim,
+                            const T* dataset,
+                            size_t n_rows,
+                            uint32_t* labels,
+                            float* cluster_centers,
+                            size_t n_clusters,
+                            double trainset_fraction,
+                            raft::distance::DistanceType metric,
+                            rmm::cuda_stream_view stream)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "kmeans::build_optimized_kmeans(%u, %u)", n_rows, n_clusters);
+
+  auto trainset_ratio =
+    std::max<size_t>(1, n_rows / std::max<size_t>(trainset_fraction * n_rows, n_clusters));
+  auto n_rows_train = n_rows / trainset_ratio;
+
+  uint32_t n_mesoclusters = std::pow<double>(n_clusters, 0.5) + 0.5;
+  RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
+
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> device_memory(
+    rmm::mr::get_current_device_resource(),
+    // an arbitrary guess on the upper bound of the workspace size
+    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4));
+
+  rmm::device_uvector<T> trainset(n_rows_train * dim, stream, &managed_memory);
+  // TODO: a proper sampling
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
+                                  sizeof(T) * dim,
+                                  dataset,
+                                  sizeof(T) * dim * trainset_ratio,
+                                  sizeof(T) * dim,
+                                  n_rows_train,
+                                  cudaMemcpyDefault,
+                                  stream));
+
+  auto [mesocluster_labels_buf, mesocluster_sizes_buf] = build_clusters(handle,
+                                                                        n_iters,
+                                                                        dim,
+                                                                        trainset.data(),
+                                                                        n_rows_train,
+                                                                        n_mesoclusters,
+                                                                        metric,
+                                                                        &managed_memory,
+                                                                        &device_memory,
+                                                                        stream);
+
+  auto mesocluster_sizes  = mesocluster_sizes_buf.data();
+  auto mesocluster_labels = mesocluster_labels_buf.data();
+
+  handle.sync_stream(stream);
+
+  auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] =
+    arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes);
+
+  auto n_clusters_done = build_fine_clusters(handle,
+                                             n_iters,
+                                             dim,
+                                             trainset.data(),
+                                             mesocluster_labels,
+                                             n_rows_train,
+                                             fine_clusters_nums.data(),
+                                             fine_clusters_csum.data(),
+                                             mesocluster_sizes,
+                                             n_mesoclusters,
+                                             mesocluster_size_max,
+                                             fine_clusters_nums_max,
+                                             cluster_centers,
+                                             metric,
+                                             &managed_memory,
+                                             &device_memory,
+                                             stream);
+  RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
 
-  mc_trainset_ccenters_tmp.resize(n_clusters * dim, stream);
-  mc_trainset_csizes_tmp.resize(n_clusters, stream);
+  rmm::device_uvector<float> centers_temp(n_clusters * dim, stream, &device_memory);
+  // TODO: is this the same as list_sizes comptuer later?..
+  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, &device_memory);
 
   // Fitting whole clusters using whole trainset.
   for (int iter = 0; iter < 2; iter++) {
@@ -605,11 +703,11 @@ void build_optimized_kmeans(const handle_t& handle,
                     labels,
                     metric,
                     true,
-                    mc_trainset_ccenters_tmp.data(),
-                    mc_trainset_csizes_tmp.data(),
+                    centers_temp.data(),
+                    cluster_sizes.data(),
                     true,
                     stream,
-                    &kmeans_mem_res);
+                    &device_memory);
   }  // end for (int iter = 0; iter < 2; iter++)
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
@@ -623,11 +721,11 @@ void build_optimized_kmeans(const handle_t& handle,
                   labels,
                   metric,
                   true,
-                  mc_trainset_ccenters_tmp.data(),
-                  mc_trainset_csizes_tmp.data(),
+                  centers_temp.data(),
+                  cluster_sizes.data(),
                   true,
                   stream,
-                  &kmeans_mem_res);
+                  &device_memory);
 
   kmeans::predict(handle,
                   cluster_centers,
@@ -638,11 +736,11 @@ void build_optimized_kmeans(const handle_t& handle,
                   labels,
                   metric,
                   true,
-                  mc_trainset_ccenters_tmp.data(),
-                  mc_trainset_csizes_tmp.data(),
+                  centers_temp.data(),
+                  cluster_sizes.data(),
                   false,
                   stream,
-                  &kmeans_mem_res);
+                  &device_memory);
 }
 
 }  // namespace raft::spatial::knn::detail::kmeans

From fbcb16b103ff3ca3761496ce9c85a9ba3f867275 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Jun 2022 17:53:58 +0200
Subject: [PATCH 069/118] A few smaller refactorings for kmeans

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  15 +--
 .../knn/detail/ann_kmeans_balanced.cuh        | 121 ++++++++----------
 2 files changed, 58 insertions(+), 78 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 5079de3fec..15ed3b52ac 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -176,7 +176,9 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
 
   // kmeans cluster ids for the dataset
   rmm::device_uvector<uint32_t> labels(n_rows, stream_);
-  auto&& centers = make_array_for_index<float>(stream_, n_lists, dim);
+  auto&& centers      = make_array_for_index<float>(stream_, n_lists, dim);
+  auto&& list_sizes   = make_array_for_index<uint32_t>(stream_, n_lists);
+  auto list_sizes_ptr = list_sizes.data();
 
   // Predict labels of the whole dataset
   kmeans::build_optimized_kmeans(handle_,
@@ -185,22 +187,13 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
                                  dataset,
                                  n_rows,
                                  labels.data(),
+                                 list_sizes_ptr,
                                  centers.data(),
                                  n_lists,
                                  params_.kmeans_trainset_fraction,
                                  metric_type_,
                                  stream_);
 
-  auto&& list_sizes   = make_array_for_index<uint32_t>(stream_, n_lists);
-  auto list_sizes_ptr = list_sizes.data();
-  stats::histogram(stats::HistType::HistTypeAuto,
-                   reinterpret_cast<int*>(list_sizes_ptr),
-                   n_lists,
-                   labels.data(),
-                   n_rows,
-                   uint32_t(1),
-                   stream_);
-
   // NB: stream_ must be equal to handle_.get_stream() to have the thrust functions executed in
   // order with the rest
   auto thrust_policy = handle_.get_thrust_policy();
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 085cfa9f85..2073b67db6 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -395,53 +395,49 @@ template <typename T>
 auto build_clusters(const handle_t& handle,
                     uint32_t n_iters,
                     size_t dim,
-                    const T* dataset_mptr,
+                    const T* dataset,  // managedl [n_rows, dim]
                     size_t n_rows,
                     size_t n_clusters,
+                    float* cluster_centers,    // managed; [n_clusters, dim]
+                    uint32_t* cluster_labels,  // managed; [n_rows]
+                    uint32_t* cluster_sizes,   // managed; [n_clusters]
                     raft::distance::DistanceType metric,
-                    rmm::mr::managed_memory_resource* managed_memory,
                     rmm::mr::device_memory_resource* device_memory,
                     rmm::cuda_stream_view stream)
 {
-  rmm::device_uvector<float> cluster_centers(n_clusters * dim, stream, managed_memory);
-  rmm::device_uvector<uint32_t> cluster_labels(n_rows, stream, managed_memory);
-  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, managed_memory);
-  rmm::device_uvector<float> cluster_centers_tmp(n_clusters * dim, stream, managed_memory);
+  rmm::device_uvector<float> cluster_centers_tmp(n_clusters * dim, stream, device_memory);
 
-  // Training meso-clusters
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
     kmeans::predict(handle,
-                    cluster_centers.data(),
+                    cluster_centers,
                     n_clusters,
                     dim,
-                    dataset_mptr,
+                    dataset,
                     n_rows,
-                    cluster_labels.data(),
+                    cluster_labels,
                     metric,
                     (iter != 0),
                     cluster_centers_tmp.data(),
-                    cluster_sizes.data(),
+                    cluster_sizes,
                     true,
                     stream,
                     device_memory);
 
     if (iter + 1 < 2 * n_iters) {
-      if (kmeans::adjust_centers(cluster_centers.data(),
+      if (kmeans::adjust_centers(cluster_centers,
                                  n_clusters,
                                  dim,
-                                 dataset_mptr,
+                                 dataset,
                                  n_rows,
-                                 cluster_labels.data(),
+                                 cluster_labels,
                                  metric,
-                                 cluster_sizes.data(),
+                                 cluster_sizes,
                                  (float)1.0 / 4,
                                  stream)) {
         iter -= 1;
       }
     }
   }
-
-  return std::make_tuple(std::move(cluster_labels), std::move(cluster_sizes));
 }
 
 /** Calculate how many fine clusters should belong to each mesocluster. */
@@ -532,8 +528,6 @@ auto build_fine_clusters(const handle_t& handle,
 
   rmm::device_uvector<float> mc_trainset_ccenters(
     fine_clusters_nums_max * dim, stream, managed_memory);
-  rmm::device_uvector<float> mc_trainset_ccenters_tmp(
-    fine_clusters_nums_max * dim, stream, managed_memory);
   // number of vectors in each cluster
   rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
     fine_clusters_nums_max, stream, managed_memory);
@@ -550,37 +544,19 @@ auto build_fine_clusters(const handle_t& handle,
     utils::copy_selected<T>(
       mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream);
 
-    for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
-      kmeans::predict(handle,
-                      mc_trainset_ccenters.data(),
-                      fine_clusters_nums[i],
-                      dim,
-                      mc_trainset,
-                      mesocluster_sizes[i],
-                      mc_trainset_labels.data(),
-                      metric,
-                      (iter != 0),
-                      mc_trainset_ccenters_tmp.data(),
-                      mc_trainset_csizes_tmp.data(),
-                      true,
-                      stream,
-                      device_memory);
-
-      if (iter + 1 < 2 * n_iters) {
-        if (kmeans::adjust_centers(mc_trainset_ccenters.data(),
-                                   fine_clusters_nums[i],
-                                   dim,
-                                   mc_trainset,
-                                   mesocluster_sizes[i],
-                                   mc_trainset_labels.data(),
-                                   metric,
-                                   mc_trainset_csizes_tmp.data(),
-                                   (float)1.0 / 4,
-                                   stream)) {
-          iter -= 1;
-        }
-      }
-    }
+    build_clusters(handle,
+                   n_iters,
+                   dim,
+                   mc_trainset,
+                   mesocluster_sizes[i],
+                   fine_clusters_nums[i],
+                   mc_trainset_ccenters.data(),
+                   mc_trainset_labels.data(),
+                   mc_trainset_csizes_tmp.data(),
+                   metric,
+                   device_memory,
+                   stream);
+
     raft::copy(cluster_centers + (dim * fine_clusters_csum[i]),
                mc_trainset_ccenters.data(),
                fine_clusters_nums[i] * dim,
@@ -602,6 +578,7 @@ auto build_fine_clusters(const handle_t& handle,
  * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
  * @param n_rows number of rows in the input
  * @param[out] labels a device pointer to the output labels [n_rows]
+ * @param[out] cluster_sizes a device pointer to the found cluster sizes [n_cluster]
  * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
  * @param n_cluster
  * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
@@ -616,6 +593,7 @@ void build_optimized_kmeans(const handle_t& handle,
                             const T* dataset,
                             size_t n_rows,
                             uint32_t* labels,
+                            uint32_t* cluster_sizes,
                             float* cluster_centers,
                             size_t n_clusters,
                             double trainset_fraction,
@@ -649,22 +627,32 @@ void build_optimized_kmeans(const handle_t& handle,
                                   cudaMemcpyDefault,
                                   stream));
 
-  auto [mesocluster_labels_buf, mesocluster_sizes_buf] = build_clusters(handle,
-                                                                        n_iters,
-                                                                        dim,
-                                                                        trainset.data(),
-                                                                        n_rows_train,
-                                                                        n_mesoclusters,
-                                                                        metric,
-                                                                        &managed_memory,
-                                                                        &device_memory,
-                                                                        stream);
+  // build coarse clusters (mesoclusters)
+  rmm::device_uvector<uint32_t> mesocluster_labels_buf(n_rows_train, stream, &managed_memory);
+  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
+  {
+    rmm::device_uvector<float> mesocluster_centers_buf(
+      n_mesoclusters * dim, stream, &managed_memory);
+    build_clusters(handle,
+                   n_iters,
+                   dim,
+                   trainset.data(),
+                   n_rows_train,
+                   n_mesoclusters,
+                   mesocluster_centers_buf.data(),
+                   mesocluster_labels_buf.data(),
+                   mesocluster_sizes_buf.data(),
+                   metric,
+                   &device_memory,
+                   stream);
+  }
 
   auto mesocluster_sizes  = mesocluster_sizes_buf.data();
   auto mesocluster_labels = mesocluster_labels_buf.data();
 
   handle.sync_stream(stream);
 
+  // build fine clusters
   auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] =
     arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes);
 
@@ -688,10 +676,8 @@ void build_optimized_kmeans(const handle_t& handle,
   RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
 
   rmm::device_uvector<float> centers_temp(n_clusters * dim, stream, &device_memory);
-  // TODO: is this the same as list_sizes comptuer later?..
-  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, &device_memory);
 
-  // Fitting whole clusters using whole trainset.
+  // fit clusters using the trainset
   for (int iter = 0; iter < 2; iter++) {
     // NB: labels.size == n_rows >= n_rows_train; the output is not used.
     kmeans::predict(handle,
@@ -704,14 +690,15 @@ void build_optimized_kmeans(const handle_t& handle,
                     metric,
                     true,
                     centers_temp.data(),
-                    cluster_sizes.data(),
+                    cluster_sizes,
                     true,
                     stream,
                     &device_memory);
-  }  // end for (int iter = 0; iter < 2; iter++)
+  }
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
+  // fit clusters using the whole dataset
   kmeans::predict(handle,
                   cluster_centers,
                   n_clusters,
@@ -722,7 +709,7 @@ void build_optimized_kmeans(const handle_t& handle,
                   metric,
                   true,
                   centers_temp.data(),
-                  cluster_sizes.data(),
+                  cluster_sizes,
                   true,
                   stream,
                   &device_memory);
@@ -737,7 +724,7 @@ void build_optimized_kmeans(const handle_t& handle,
                   metric,
                   true,
                   centers_temp.data(),
-                  cluster_sizes.data(),
+                  cluster_sizes,
                   false,
                   stream,
                   &device_memory);

From 29ca199ceec3f15b4f6bdf52fbb1e5a9e0667dc0 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 20 Jun 2022 14:46:42 +0200
Subject: [PATCH 070/118] Add docs to public methods of the handle

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 15ed3b52ac..bb7033adff 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -36,6 +36,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
@@ -48,16 +49,40 @@ class cuivflHandle {
                raft::distance::DistanceType metric_type,
                const ivf_flat_params& params);
 
+  /**
+   * @brief Build the index from the dataset for efficient search.
+   *
+   * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+   * @param n_rows number of samples
+   * @param dim the dimensionality of the data
+   */
   void cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32_t dim);
 
+  /**
+   * @brief Set the search parameters. Must be called before `cuivflSearch`
+   *
+   * @param n_probes number of clusters to look at for each query (affects speed vs recall).
+   * @param max_batch maximum number of queries (affects the required temp memory).
+   * @param max_k maximum number of neighbors to search for.
+   */
   void cuivflSetSearchParameters(const uint32_t n_probes,
                                  const uint32_t max_batch,
                                  const uint32_t max_k);
 
+  /**
+   * @brief Search ANN using the constructed index.
+   *
+   * @param[in] queries a device pointer to a row-major matrix [n_queries, dim]
+   * @param n_queries is the batch size
+   * @param k is the number of neighbors to find for each query.
+   * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+   * [n_queries, k]
+   * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries,
+   * k]
+   */
   void cuivflSearch(
     const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, float* distances);
 
-  void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
   uint32_t getDim() { return index_.has_value() ? index_->dim() : 0; }
 
  private:
@@ -78,6 +103,8 @@ class cuivflHandle {
   template <typename AccT>
   void cuivflSearchImpl(
     const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, AccT* distances);
+
+  void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
 };
 
 template <typename T>
@@ -194,15 +221,12 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
                                  metric_type_,
                                  stream_);
 
-  // NB: stream_ must be equal to handle_.get_stream() to have the thrust functions executed in
-  // order with the rest
-  auto thrust_policy = handle_.get_thrust_policy();
-
+  // Calculate offsets into cluster data using exclusive scan
   auto&& list_offsets   = make_array_for_index<uint32_t>(stream_, n_lists + 1);
   auto list_offsets_ptr = list_offsets.data();
 
   thrust::exclusive_scan(
-    thrust_policy,
+    rmm::exec_policy(stream_),
     list_sizes_ptr,
     list_sizes_ptr + n_lists + 1,
     list_offsets_ptr,

From 38b3cec4986ab5fc7fb04347e4a5997a3f3005e3 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Jun 2022 10:13:25 +0200
Subject: [PATCH 071/118] Made the metric be a part of the index struct and set
 the greater_ = true for similarity metrics

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 63 +++++++++----------
 .../knn/detail/ann_ivf_flat_kernel.cuh        |  5 +-
 .../raft/spatial/knn/detail/ann_quantized.cuh |  4 +-
 cpp/test/spatial/ann_base_kernel.cuh          |  2 +-
 4 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index bb7033adff..77fa066925 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -45,9 +45,10 @@ namespace raft::spatial::knn::detail {
 template <typename T>
 class cuivflHandle {
  public:
-  cuivflHandle(const handle_t& handle,
-               raft::distance::DistanceType metric_type,
-               const ivf_flat_params& params);
+  cuivflHandle(const handle_t& handle, const ivf_flat_params& params)
+    : handle_(handle), stream_(handle_.get_stream()), params_(params), grid_dim_x_(0)
+  {
+  }
 
   /**
    * @brief Build the index from the dataset for efficient search.
@@ -55,8 +56,12 @@ class cuivflHandle {
    * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
    * @param n_rows number of samples
    * @param dim the dimensionality of the data
+   * @param metric distance type
    */
-  void cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32_t dim);
+  void cuivflBuildIndex(const T* dataset,
+                        uint32_t n_rows,
+                        uint32_t dim,
+                        raft::distance::DistanceType metric);
 
   /**
    * @brief Set the search parameters. Must be called before `cuivflSearch`
@@ -90,7 +95,6 @@ class cuivflHandle {
   const rmm::cuda_stream_view stream_;
   ivf_flat_params params_;
 
-  const raft::distance::DistanceType metric_type_;
   bool greater_;
   uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
   // The built index
@@ -107,18 +111,6 @@ class cuivflHandle {
   void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
 };
 
-template <typename T>
-cuivflHandle<T>::cuivflHandle(const handle_t& handle,
-                              raft::distance::DistanceType metric_type,
-                              const ivf_flat_params& params)
-  : handle_(handle),
-    stream_(handle_.get_stream()),
-    params_(params),
-    grid_dim_x_(0),
-    metric_type_(metric_type)
-{
-}
-
 /**
  * @brief Record the dataset into the index, one source row at a time.
  *
@@ -187,7 +179,10 @@ __global__ void build_index_kernel(const uint32_t* labels,
 }
 
 template <typename T>
-void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32_t dim)
+void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
+                                       uint32_t n_rows,
+                                       uint32_t dim,
+                                       raft::distance::DistanceType metric)
 {
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "unsupported data type");
@@ -218,7 +213,7 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
                                  centers.data(),
                                  n_lists,
                                  params_.kmeans_trainset_fraction,
-                                 metric_type_,
+                                 metric,
                                  stream_);
 
   // Calculate offsets into cluster data using exclusive scan
@@ -263,13 +258,13 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset, uint32_t n_rows, uint32
     RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
   };
-  auto&& center_norms = metric_type_ == raft::distance::DistanceType::L2Expanded
+  auto&& center_norms = metric == raft::distance::DistanceType::L2Expanded
                           ? std::optional(compute_norms())
                           : std::nullopt;
 
   // assemble the index
-  index_.emplace(
-    ivf_flat_index<T>{veclen, data, indices, list_sizes, list_offsets, centers, center_norms});
+  index_.emplace(ivf_flat_index<T>{
+    veclen, metric, data, indices, list_sizes, list_offsets, centers, center_norms});
 
   // check index invariants
   index_->check_consistency();
@@ -285,7 +280,7 @@ void cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t n_probes,
                                                                   nullptr,
                                                                   nullptr,
                                                                   n_queries,
-                                                                  metric_type_,
+                                                                  index_->metric,
                                                                   n_probes,
                                                                   k,
                                                                   greater_,
@@ -303,13 +298,17 @@ void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
   RAFT_EXPECTS(n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
   params_.nprobe = n_probes;
-  // Set the greater_
-  if (metric_type_ == raft::distance::DistanceType::L2Expanded ||
-      metric_type_ == raft::distance::DistanceType::L2Unexpanded) {
-    greater_ = false;
-  } else {
-    // Need to set this to true for inner product if need FAISS like behavior for inner product
-    greater_ = false;
+
+  switch (index_->metric) {
+    case raft::distance::DistanceType::InnerProduct:
+    case raft::distance::DistanceType::CosineExpanded:
+    case raft::distance::DistanceType::CorrelationExpanded:
+      // Similarity metrics have the opposite meaning, i.e. nearest neigbours are those with larger
+      // similarity (See the same logic at cpp/include/raft/sparse/selection/detail/knn.cuh:362
+      // {perform_k_selection})
+      greater_ = true;
+      break;
+    default: greater_ = false;
   }
 
   // Set memory buffer to be reused across searches
@@ -376,7 +375,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
   float alpha = 1.0f;
   float beta  = 0.0f;
 
-  if (metric_type_ == raft::distance::DistanceType::L2Expanded) {
+  if (index_->metric == raft::distance::DistanceType::L2Expanded) {
     alpha = -2.0f;
     beta  = 1.0f;
     utils::dots_along_rows(
@@ -447,7 +446,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                                                   queries,
                                                                   coarse_indices_dev.data(),
                                                                   n_queries,
-                                                                  metric_type_,
+                                                                  index_->metric,
                                                                   n_probes,
                                                                   k,
                                                                   greater_,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index c671cd34ba..2acae697c2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -27,6 +27,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.cuh>
+#include <raft/distance/distance_type.hpp>
 #include <raft/pow2_utils.cuh>
 
 #ifdef USE_FAISS
@@ -55,6 +56,8 @@ struct ivf_flat_index {
    * Vectorized load/store size in elements, determines the size of interleaved data chunks.
    */
   uint32_t veclen;
+  /** Distance metric used for clustering. */
+  raft::distance::DistanceType metric;
 
   /**
    * Inverted list data [size, dim].
@@ -82,7 +85,6 @@ struct ivf_flat_index {
   device_mdarray<T, extent_2d, row_major> data;
   /** Inverted list indices: ids of items in the source data [size] */
   device_mdarray<uint32_t, extent_1d, row_major> indices;
-
   /** Sizes of the lists (clusters) [n_lists] */
   device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
   /**
@@ -90,7 +92,6 @@ struct ivf_flat_index {
    * The last value contains the total length of the index.
    */
   device_mdarray<uint32_t, extent_1d, row_major> list_offsets;
-
   /** k-means cluster centers corresponding to the lists [n_lists, dim] */
   device_mdarray<float, extent_2d, row_major> centers;
   /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metrix [n_lists]  */
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index e8c962fd0c..8787c84fdb 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -97,8 +97,8 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            IntType n,
                                            IntType D)
 {
-  index->ivf_flat<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, metric, *params);
-  index->ivf_flat<T>()->cuivflBuildIndex(dataset, n, D);
+  index->ivf_flat<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, *params);
+  index->ivf_flat<T>()->cuivflBuildIndex(dataset, n, D, metric);
 }
 
 template <typename IntType = int>
diff --git a/cpp/test/spatial/ann_base_kernel.cuh b/cpp/test/spatial/ann_base_kernel.cuh
index 2c9698eafd..4462875de2 100644
--- a/cpp/test/spatial/ann_base_kernel.cuh
+++ b/cpp/test/spatial/ann_base_kernel.cuh
@@ -93,7 +93,7 @@ void naiveBfKnn(float* dist_topk,
                              input_len,
                              dist_topk + offset * k,
                              indices_topk + offset * k,
-                             true,
+                             type != raft::distance::DistanceType::InnerProduct,
                              static_cast<int>(k),
                              stream,
                              SelectKAlgo::WARP_SORT);

From d19bb5f0aeba9bfe423f33d33bece8e8ba1911fd Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Jun 2022 10:32:25 +0200
Subject: [PATCH 072/118] Do not persist grid_dim_x between searches

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 77fa066925..103c2188ab 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -46,7 +46,7 @@ template <typename T>
 class cuivflHandle {
  public:
   cuivflHandle(const handle_t& handle, const ivf_flat_params& params)
-    : handle_(handle), stream_(handle_.get_stream()), params_(params), grid_dim_x_(0)
+    : handle_(handle), stream_(handle_.get_stream()), params_(params)
   {
   }
 
@@ -96,7 +96,6 @@ class cuivflHandle {
   ivf_flat_params params_;
 
   bool greater_;
-  uint32_t grid_dim_x_;  // The number of blocks launched across n_probes.
   // The built index
   std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
 
@@ -107,8 +106,6 @@ class cuivflHandle {
   template <typename AccT>
   void cuivflSearchImpl(
     const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, AccT* distances);
-
-  void queryIVFFlatGridSize(const uint32_t n_probes, const uint32_t n_queries, const uint32_t k);
 };
 
 /**
@@ -270,26 +267,6 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
   index_->check_consistency();
 }
 
-template <typename T>
-void cuivflHandle<T>::queryIVFFlatGridSize(const uint32_t n_probes,
-                                           const uint32_t n_queries,
-                                           const uint32_t k)
-{
-  // query the gridDimX size to store probes topK output
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
-                                                                  nullptr,
-                                                                  nullptr,
-                                                                  n_queries,
-                                                                  index_->metric,
-                                                                  n_probes,
-                                                                  k,
-                                                                  greater_,
-                                                                  nullptr,
-                                                                  nullptr,
-                                                                  grid_dim_x_,
-                                                                  stream_);
-}
-
 template <typename T>
 void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
                                                 const uint32_t max_batch,
@@ -340,9 +317,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                        AccT* distances)
 {
   uint32_t n_probes = std::min(params_.nprobe, params_.nlist);
-  grid_dim_x_       = 0;
-  queryIVFFlatGridSize(n_probes, n_queries, k);
-  auto search_mr = &(search_mem_res.value());
+  auto search_mr    = &(search_mem_res.value());
   // The norm of query
   rmm::device_uvector<float> query_norm_dev(n_queries, stream_, search_mr);
   // The distance value of cluster(list) and queries
@@ -437,7 +412,27 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
 
   AccT* distances_dev_ptr = refined_distances_dev.data();
   size_t* indices_dev_ptr = refined_indices_dev.data();
-  if (n_probes == 1 || grid_dim_x_ == 1) {
+
+  uint32_t grid_dim_x = 0;
+  if (n_probes > 1) {
+    // query the gridDimX size to store probes topK output
+    ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
+                                                                    nullptr,
+                                                                    nullptr,
+                                                                    n_queries,
+                                                                    index_->metric,
+                                                                    n_probes,
+                                                                    k,
+                                                                    greater_,
+                                                                    nullptr,
+                                                                    nullptr,
+                                                                    grid_dim_x,
+                                                                    stream_);
+  } else {
+    grid_dim_x = 1;
+  }
+
+  if (grid_dim_x == 1) {
     distances_dev_ptr = distances;
     indices_dev_ptr   = neighbors;
   }
@@ -452,19 +447,19 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                                                   greater_,
                                                                   indices_dev_ptr,
                                                                   distances_dev_ptr,
-                                                                  grid_dim_x_,
+                                                                  grid_dim_x,
                                                                   stream_);
 
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
 
   // Merge topk values from different blocks
-  if (grid_dim_x_ > 1) {
+  if (grid_dim_x > 1) {
     if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
       topk::warp_sort_topk<AccT, size_t>(refined_distances_dev.data(),
                                          refined_indices_dev.data(),
                                          n_queries,
-                                         k * grid_dim_x_,
+                                         k * grid_dim_x,
                                          k,
                                          distances,
                                          neighbors,
@@ -474,7 +469,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
       topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
                                               refined_indices_dev.data(),
                                               n_queries,
-                                              k * grid_dim_x_,
+                                              k * grid_dim_x,
                                               k,
                                               distances,
                                               neighbors,

From 9094707ac4a4e9de6ca2a6c00600225966d1c330 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Jun 2022 11:17:52 +0200
Subject: [PATCH 073/118] Refactor names according to clang-tidy

---
 cpp/include/raft/spatial/knn/ann_common.h     |  16 +--
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 123 +++++++++---------
 .../knn/detail/ann_ivf_flat_kernel.cuh        |  35 ++---
 .../raft/spatial/knn/detail/ann_quantized.cuh |  24 +---
 4 files changed, 93 insertions(+), 105 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 8d870556b8..6a8c5056ef 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -24,16 +24,16 @@ namespace raft::spatial::knn {
 
 namespace detail {
 template <typename T>
-class cuivflHandle;
+class ivf_flat_handle;
 };
 
 struct knnIndex {
   faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
-  std::unique_ptr<detail::cuivflHandle<float>> ivf_flat_float_;
-  std::unique_ptr<detail::cuivflHandle<uint8_t>> ivf_flat_uint8_t_;
-  std::unique_ptr<detail::cuivflHandle<int8_t>> ivf_flat_int8_t_;
+  std::unique_ptr<detail::ivf_flat_handle<float>> ivf_flat_float_;
+  std::unique_ptr<detail::ivf_flat_handle<uint8_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<detail::ivf_flat_handle<int8_t>> ivf_flat_int8_t_;
 
   raft::spatial::knn::RmmGpuResources* gpu_res;
   int device;
@@ -44,23 +44,23 @@ struct knnIndex {
   }
 
   template <typename T>
-  auto ivf_flat() -> std::unique_ptr<detail::cuivflHandle<T>>&;
+  auto ivf_flat() -> std::unique_ptr<detail::ivf_flat_handle<T>>&;
 };
 
 template <>
-auto knnIndex::ivf_flat<float>() -> std::unique_ptr<detail::cuivflHandle<float>>&
+auto knnIndex::ivf_flat<float>() -> std::unique_ptr<detail::ivf_flat_handle<float>>&
 {
   return ivf_flat_float_;
 }
 
 template <>
-auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<detail::cuivflHandle<uint8_t>>&
+auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<detail::ivf_flat_handle<uint8_t>>&
 {
   return ivf_flat_uint8_t_;
 }
 
 template <>
-auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<detail::cuivflHandle<int8_t>>&
+auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<detail::ivf_flat_handle<int8_t>>&
 {
   return ivf_flat_int8_t_;
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 103c2188ab..6952343c71 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -43,10 +43,10 @@
 namespace raft::spatial::knn::detail {
 
 template <typename T>
-class cuivflHandle {
+class ivf_flat_handle {
  public:
-  cuivflHandle(const handle_t& handle, const ivf_flat_params& params)
-    : handle_(handle), stream_(handle_.get_stream()), params_(params)
+  ivf_flat_handle(const handle_t& handle, ivf_flat_params params)
+    : handle_(handle), stream_(handle_.get_stream()), params_(std::move(params))
   {
   }
 
@@ -58,21 +58,7 @@ class cuivflHandle {
    * @param dim the dimensionality of the data
    * @param metric distance type
    */
-  void cuivflBuildIndex(const T* dataset,
-                        uint32_t n_rows,
-                        uint32_t dim,
-                        raft::distance::DistanceType metric);
-
-  /**
-   * @brief Set the search parameters. Must be called before `cuivflSearch`
-   *
-   * @param n_probes number of clusters to look at for each query (affects speed vs recall).
-   * @param max_batch maximum number of queries (affects the required temp memory).
-   * @param max_k maximum number of neighbors to search for.
-   */
-  void cuivflSetSearchParameters(const uint32_t n_probes,
-                                 const uint32_t max_batch,
-                                 const uint32_t max_k);
+  void build(const T* dataset, uint32_t n_rows, uint32_t dim, raft::distance::DistanceType metric);
 
   /**
    * @brief Search ANN using the constructed index.
@@ -80,32 +66,44 @@ class cuivflHandle {
    * @param[in] queries a device pointer to a row-major matrix [n_queries, dim]
    * @param n_queries is the batch size
    * @param k is the number of neighbors to find for each query.
+   * @param n_probes number of clusters to look at for each query (affects speed vs recall).
    * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
    * [n_queries, k]
    * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries,
    * k]
    */
-  void cuivflSearch(
-    const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, float* distances);
+  void search(const T* queries,
+              uint32_t n_queries,
+              uint32_t k,
+              uint32_t n_probes,
+              size_t* neighbors,
+              float* distances);
 
-  uint32_t getDim() { return index_.has_value() ? index_->dim() : 0; }
+  /** Whether `build` method has already been succesfully invoked. */
+  [[nodiscard]] auto is_trained() const -> bool { return index_.has_value(); }
+
+  /** Dimensionality of the data, on which the index has been built. */
+  [[nodiscard]] auto data_dim() const -> uint32_t { return is_trained() ? index_->dim() : 0; }
 
  private:
   const handle_t& handle_;
   const rmm::cuda_stream_view stream_;
   ivf_flat_params params_;
 
-  bool greater_;
   // The built index
   std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
 
   // Memory pool for use during search; after the first search is done the pool is not likely to
   // resize, saving the costs of allocations.
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res;
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res_;
 
   template <typename AccT>
-  void cuivflSearchImpl(
-    const T* queries, uint32_t n_queries, uint32_t k, size_t* neighbors, AccT* distances);
+  void search_impl(const T* queries,
+                   uint32_t n_queries,
+                   uint32_t k,
+                   bool select_min,
+                   size_t* neighbors,
+                   AccT* distances);
 };
 
 /**
@@ -176,11 +174,13 @@ __global__ void build_index_kernel(const uint32_t* labels,
 }
 
 template <typename T>
-void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
-                                       uint32_t n_rows,
-                                       uint32_t dim,
-                                       raft::distance::DistanceType metric)
+void ivf_flat_handle<T>::build(const T* dataset,
+                               uint32_t n_rows,
+                               uint32_t dim,
+                               raft::distance::DistanceType metric)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_flat_handle::build(%u, %u)", n_rows, dim);
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "unsupported data type");
   RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
@@ -268,14 +268,21 @@ void cuivflHandle<T>::cuivflBuildIndex(const T* dataset,
 }
 
 template <typename T>
-void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
-                                                const uint32_t max_batch,
-                                                const uint32_t max_k)
+void ivf_flat_handle<T>::search(const T* queries,
+                                uint32_t n_queries,
+                                uint32_t k,
+                                uint32_t n_probes,
+                                size_t* neighbors,
+                                float* distances)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_flat_handle::search(%u, %u, %zu)", n_queries, k, neighbors);
+
+  params_.nprobe = n_probes;
   RAFT_EXPECTS(n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
-  params_.nprobe = n_probes;
 
+  bool select_min;
   switch (index_->metric) {
     case raft::distance::DistanceType::InnerProduct:
     case raft::distance::DistanceType::CosineExpanded:
@@ -283,41 +290,31 @@ void cuivflHandle<T>::cuivflSetSearchParameters(const uint32_t n_probes,
       // Similarity metrics have the opposite meaning, i.e. nearest neigbours are those with larger
       // similarity (See the same logic at cpp/include/raft/sparse/selection/detail/knn.cuh:362
       // {perform_k_selection})
-      greater_ = true;
+      select_min = false;
       break;
-    default: greater_ = false;
+    default: select_min = true;
   }
 
   // Set memory buffer to be reused across searches
   auto cur_memory_resource = rmm::mr::get_current_device_resource();
-  if (!search_mem_res.has_value() || search_mem_res->get_upstream() != cur_memory_resource) {
-    search_mem_res.emplace(cur_memory_resource,
-                           Pow2<256>::roundUp(max_batch * n_probes * max_k * 16));
+  if (!search_mem_res_.has_value() || search_mem_res_->get_upstream() != cur_memory_resource) {
+    search_mem_res_.emplace(cur_memory_resource, Pow2<256>::roundUp(n_queries * n_probes * k * 16));
   }
-}
 
-template <typename T>
-void cuivflHandle<T>::cuivflSearch(const T* queries,  // [numQueries, dim]
-                                   uint32_t n_queries,
-                                   uint32_t k,
-                                   size_t* neighbors,  // [numQueries, topK]
-                                   float* distances)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "cuivflSearch(%u, %u, %zu)", n_queries, k, neighbors);
-  cuivflSearchImpl<float>(queries, n_queries, k, neighbors, distances);
+  search_impl<float>(queries, n_queries, k, select_min, neighbors, distances);
 }
 
 template <typename T>
 template <typename AccT>
-void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
-                                       uint32_t n_queries,
-                                       uint32_t k,
-                                       size_t* neighbors,  // [numQueries, topK]
-                                       AccT* distances)
+void ivf_flat_handle<T>::search_impl(const T* queries,
+                                     uint32_t n_queries,
+                                     uint32_t k,
+                                     bool select_min,
+                                     size_t* neighbors,
+                                     AccT* distances)
 {
   uint32_t n_probes = std::min(params_.nprobe, params_.nlist);
-  auto search_mr    = &(search_mem_res.value());
+  auto search_mr    = &(search_mem_res_.value());
   // The norm of query
   rmm::device_uvector<float> query_norm_dev(n_queries, stream_, search_mr);
   // The distance value of cluster(list) and queries
@@ -393,7 +390,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                          n_probes,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
-                                         !greater_,
+                                         select_min,
                                          stream_);
   } else {
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
@@ -403,9 +400,9 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                               n_probes,
                                               coarse_distances_dev.data(),
                                               coarse_indices_dev.data(),
-                                              !greater_,
+                                              select_min,
                                               stream_,
-                                              &(search_mem_res.value()));
+                                              &(search_mem_res_.value()));
   }
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes);
@@ -423,7 +420,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                                                     index_->metric,
                                                                     n_probes,
                                                                     k,
-                                                                    greater_,
+                                                                    select_min,
                                                                     nullptr,
                                                                     nullptr,
                                                                     grid_dim_x,
@@ -444,7 +441,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                                                   index_->metric,
                                                                   n_probes,
                                                                   k,
-                                                                  greater_,
+                                                                  select_min,
                                                                   indices_dev_ptr,
                                                                   distances_dev_ptr,
                                                                   grid_dim_x,
@@ -463,7 +460,7 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                          k,
                                          distances,
                                          neighbors,
-                                         !greater_,
+                                         select_min,
                                          stream_);
     } else {
       topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
@@ -473,9 +470,9 @@ void cuivflHandle<T>::cuivflSearchImpl(const T* queries,  // [numQueries, dim]
                                               k,
                                               distances,
                                               neighbors,
-                                              !greater_,
+                                              select_min,
                                               stream_,
-                                              &(search_mem_res.value()));
+                                              &(search_mem_res_.value()));
     }
   }
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index 2acae697c2..ff5b136391 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -817,7 +817,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
  *
  * query_smem_elems must be multiple of WarpSize * Veclen
  */
-template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
+template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
 __global__ void __launch_bounds__(kThreadsPerBlock)
   interleaved_scan_kernel(Lambda compute_dist,
                           const uint32_t query_smem_elems,
@@ -845,7 +845,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   __shared__ float smemK[kThreadsPerBlock];
   __shared__ size_t smemV[kThreadsPerBlock];
 
-  constexpr auto Dir = Greater;
+  constexpr auto Dir = !Ascending;
   constexpr auto identity =
     Dir ? std::numeric_limits<float>::min() : std::numeric_limits<float>::max();
   constexpr auto keyMax =
@@ -856,7 +856,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
-  topk::block_sort<topk::warp_sort_filtered, Capacity, !Greater, float, size_t> queue(
+  topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, size_t> queue(
     k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
 #endif
 
@@ -957,7 +957,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       }
 
       // Enqueue one element per thread
-      constexpr float kDummy = Greater ? lower_bound<float>() : upper_bound<float>();
+      constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
       float val              = (valid) ? (float)dist : kDummy;
       queue.add(val, idx);
     }  // end for block < numBlocks
@@ -996,7 +996,7 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSi
   return min_grid_x > nprobe ? nprobe : static_cast<uint32_t>(min_grid_x);
 }
 
-template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename Lambda>
+template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
 void launch_kernel(Lambda lambda,
                    const ivf_flat_index<T>& index,
                    const T* queries,
@@ -1013,7 +1013,7 @@ void launch_kernel(Lambda lambda,
                "Queries data is not aligned to the vector load size (Veclen).");
   RAFT_EXPECTS(Veclen == index.veclen,
                "Configured Veclen does not match the index interleaving pattern.");
-  constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Greater, T, AccT, Lambda>;
+  constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, Lambda>;
   const int max_query_smem = 16384;
   int query_smem_elems =
     std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
@@ -1113,15 +1113,16 @@ struct inner_prod_dist {
 };
 
 /** Select the distance computation function and forward the rest of the arguments. */
-template <int Capacity, int Veclen, bool Greater, typename T, typename AccT, typename... Args>
+template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename... Args>
 void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
 {
   if (metric == raft::distance::DistanceType::L2Expanded ||
       metric == raft::distance::DistanceType::L2Unexpanded) {
-    launch_kernel<Capacity, Veclen, Greater, T, AccT, euclidean_dist<Veclen, T, AccT>>({}, args...);
+    launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>({},
+                                                                                         args...);
   } else {
-    launch_kernel<Capacity, Veclen, Greater, T, AccT, inner_prod_dist<Veclen, T, AccT>>({},
-                                                                                        args...);
+    launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>({},
+                                                                                          args...);
   }
 }
 
@@ -1141,18 +1142,18 @@ struct select_interleaved_scan_kernel {
    * two parameters and ends with both values equal to 1.
    */
   template <typename... Args>
-  static inline void run(int capacity, int veclen, bool greater, Args&&... args)
+  static inline void run(int capacity, int veclen, bool select_min, Args&&... args)
   {
     if constexpr (Capacity > 1) {
       if (capacity * 2 <= Capacity) {
         return select_interleaved_scan_kernel<T, AccT, Capacity / 2, Veclen>::run(
-          capacity, veclen, greater, args...);
+          capacity, veclen, select_min, args...);
       }
     }
     if constexpr (Veclen > 1) {
       if (veclen * 2 <= Veclen) {
         return select_interleaved_scan_kernel<T, AccT, Capacity, Veclen / 2>::run(
-          capacity, veclen, greater, args...);
+          capacity, veclen, select_min, args...);
       }
     }
     RAFT_EXPECTS(capacity == Capacity,
@@ -1162,7 +1163,7 @@ struct select_interleaved_scan_kernel {
     RAFT_EXPECTS(
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
-    if (greater) {
+    if (select_min) {
       launch_with_fixed_consts<Capacity, Veclen, true, T, AccT>(args...);
     } else {
       launch_with_fixed_consts<Capacity, Veclen, false, T, AccT>(args...);
@@ -1184,7 +1185,7 @@ struct select_interleaved_scan_kernel {
  * @param n_probes number of nearest clusters to query
  * @param k number of nearest neighbors.
  *            NB: the maximum value of `k` is limited statically by `topk::kMaxCapacity`.
- * @param greater whether to select nearest (false) or furthest (true) points w.r.t. the given
+ * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
  * metric.
  * @param[out] neighbors device pointer to the result indices for each query and cluster
  * [batch_size, grid_dim_x, k]
@@ -1202,7 +1203,7 @@ void ivfflat_interleaved_scan(const ivf_flat_index<T>& index,
                               const raft::distance::DistanceType metric,
                               const uint32_t n_probes,
                               const uint32_t k,
-                              const bool greater,
+                              const bool select_min,
                               size_t* neighbors,
                               float* distances,
                               uint32_t& grid_dim_x,
@@ -1211,7 +1212,7 @@ void ivfflat_interleaved_scan(const ivf_flat_index<T>& index,
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
   select_interleaved_scan_kernel<T, AccT>::run(capacity,
                                                index.veclen,
-                                               greater,
+                                               select_min,
                                                metric,
                                                index,
                                                queries,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 8787c84fdb..89fea9d6b6 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -97,8 +97,8 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            IntType n,
                                            IntType D)
 {
-  index->ivf_flat<T>() = std::make_unique<detail::cuivflHandle<T>>(handle, *params);
-  index->ivf_flat<T>()->cuivflBuildIndex(dataset, n, D, metric);
+  index->ivf_flat<T>() = std::make_unique<detail::ivf_flat_handle<T>>(handle, *params);
+  index->ivf_flat<T>()->build(dataset, n, D, metric);
 }
 
 template <typename IntType = int>
@@ -229,28 +229,18 @@ void approx_knn_search(const handle_t& handle,
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
     if (dynamic_cast<ivf_flat_params*>(params)) {
       ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
-      int nprobe                     = IVFFlat_param->nprobe;
-      int max_batch                  = n;
-      int max_k                      = k;
-
-      index->ivf_flat<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-      index->ivf_flat<T>()->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances);
+      index->ivf_flat<T>()->search(
+        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances);
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-      index->metric, n, index->ivf_flat<T>()->getDim(), k, false, handle.get_stream());
+      index->metric, n, index->ivf_flat<T>()->data_dim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
     if (dynamic_cast<ivf_flat_params*>(params)) {
       ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
-      int nprobe                     = IVFFlat_param->nprobe;
-      int max_batch                  = n;
-      int max_k                      = k;
-
-      index->ivf_flat<T>()->cuivflSetSearchParameters(nprobe, max_batch, max_k);
-      index->ivf_flat<T>()->cuivflSearch(
-        query_array, max_batch, max_k, (size_t*)indices, distances);
+      index->ivf_flat<T>()->search(
+        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances);
     }
     query_metric_processor->revert(query_array);
 

From 325e2011342da1cc57ed6c800c9efb4a73e19efd Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Jun 2022 11:44:57 +0200
Subject: [PATCH 074/118] Refactor the usage of stream and params

---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 128 ++++++++++--------
 .../raft/spatial/knn/detail/ann_quantized.cuh |   6 +-
 2 files changed, 74 insertions(+), 60 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 6952343c71..8f0d682a41 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -46,7 +46,7 @@ template <typename T>
 class ivf_flat_handle {
  public:
   ivf_flat_handle(const handle_t& handle, ivf_flat_params params)
-    : handle_(handle), stream_(handle_.get_stream()), params_(std::move(params))
+    : handle_(handle), params_(std::move(params))
   {
   }
 
@@ -57,8 +57,13 @@ class ivf_flat_handle {
    * @param n_rows number of samples
    * @param dim the dimensionality of the data
    * @param metric distance type
+   * @param stream
    */
-  void build(const T* dataset, uint32_t n_rows, uint32_t dim, raft::distance::DistanceType metric);
+  void build(const T* dataset,
+             uint32_t n_rows,
+             uint32_t dim,
+             raft::distance::DistanceType metric,
+             rmm::cuda_stream_view stream);
 
   /**
    * @brief Search ANN using the constructed index.
@@ -71,13 +76,15 @@ class ivf_flat_handle {
    * [n_queries, k]
    * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries,
    * k]
+   * @param stream
    */
   void search(const T* queries,
               uint32_t n_queries,
               uint32_t k,
               uint32_t n_probes,
               size_t* neighbors,
-              float* distances);
+              float* distances,
+              rmm::cuda_stream_view stream);
 
   /** Whether `build` method has already been succesfully invoked. */
   [[nodiscard]] auto is_trained() const -> bool { return index_.has_value(); }
@@ -87,8 +94,7 @@ class ivf_flat_handle {
 
  private:
   const handle_t& handle_;
-  const rmm::cuda_stream_view stream_;
-  ivf_flat_params params_;
+  const ivf_flat_params params_;
 
   // The built index
   std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
@@ -101,9 +107,11 @@ class ivf_flat_handle {
   void search_impl(const T* queries,
                    uint32_t n_queries,
                    uint32_t k,
+                   uint32_t n_probes,
                    bool select_min,
                    size_t* neighbors,
-                   AccT* distances);
+                   AccT* distances,
+                   rmm::cuda_stream_view stream);
 };
 
 /**
@@ -177,7 +185,8 @@ template <typename T>
 void ivf_flat_handle<T>::build(const T* dataset,
                                uint32_t n_rows,
                                uint32_t dim,
-                               raft::distance::DistanceType metric)
+                               raft::distance::DistanceType metric,
+                               rmm::cuda_stream_view stream)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_flat_handle::build(%u, %u)", n_rows, dim);
@@ -194,9 +203,9 @@ void ivf_flat_handle<T>::build(const T* dataset,
   auto n_lists = static_cast<uint32_t>(params_.nlist);
 
   // kmeans cluster ids for the dataset
-  rmm::device_uvector<uint32_t> labels(n_rows, stream_);
-  auto&& centers      = make_array_for_index<float>(stream_, n_lists, dim);
-  auto&& list_sizes   = make_array_for_index<uint32_t>(stream_, n_lists);
+  rmm::device_uvector<uint32_t> labels(n_rows, stream);
+  auto&& centers      = make_array_for_index<float>(stream, n_lists, dim);
+  auto&& list_sizes   = make_array_for_index<uint32_t>(stream, n_lists);
   auto list_sizes_ptr = list_sizes.data();
 
   // Predict labels of the whole dataset
@@ -211,14 +220,14 @@ void ivf_flat_handle<T>::build(const T* dataset,
                                  n_lists,
                                  params_.kmeans_trainset_fraction,
                                  metric,
-                                 stream_);
+                                 stream);
 
   // Calculate offsets into cluster data using exclusive scan
-  auto&& list_offsets   = make_array_for_index<uint32_t>(stream_, n_lists + 1);
+  auto&& list_offsets   = make_array_for_index<uint32_t>(stream, n_lists + 1);
   auto list_offsets_ptr = list_offsets.data();
 
   thrust::exclusive_scan(
-    rmm::exec_policy(stream_),
+    rmm::exec_policy(stream),
     list_sizes_ptr,
     list_sizes_ptr + n_lists + 1,
     list_offsets_ptr,
@@ -226,32 +235,32 @@ void ivf_flat_handle<T>::build(const T* dataset,
     [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
 
   uint32_t index_size;
-  update_host(&index_size, list_offsets_ptr + n_lists, 1, stream_);
-  handle_.sync_stream(stream_);
+  update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
+  handle_.sync_stream(stream);
 
-  auto&& data    = make_array_for_index<T>(stream_, index_size, dim);
-  auto&& indices = make_array_for_index<uint32_t>(stream_, index_size);
+  auto&& data    = make_array_for_index<T>(stream, index_size, dim);
+  auto&& indices = make_array_for_index<uint32_t>(stream, index_size);
 
   // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
-  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream_);
+  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream);
 
   const dim3 block_dim(256);
   const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows, block_dim.x));
-  build_index_kernel<<<grid_dim, block_dim, 0, stream_>>>(labels.data(),
-                                                          list_offsets_ptr,
-                                                          dataset,
-                                                          data.data(),
-                                                          indices.data(),
-                                                          list_sizes_ptr,
-                                                          n_rows,
-                                                          dim,
-                                                          veclen);
+  build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(labels.data(),
+                                                         list_offsets_ptr,
+                                                         dataset,
+                                                         data.data(),
+                                                         indices.data(),
+                                                         list_sizes_ptr,
+                                                         n_rows,
+                                                         dim,
+                                                         veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   // Precompute the centers vector norms for L2Expanded distance
   auto compute_norms = [&]() {
-    auto&& r = make_array_for_index<float>(stream_, params_.nlist);
-    utils::dots_along_rows(params_.nlist, dim, centers.data(), r.data(), stream_);
+    auto&& r = make_array_for_index<float>(stream, n_lists);
+    utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream);
     RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
   };
@@ -273,12 +282,14 @@ void ivf_flat_handle<T>::search(const T* queries,
                                 uint32_t k,
                                 uint32_t n_probes,
                                 size_t* neighbors,
-                                float* distances)
+                                float* distances,
+                                rmm::cuda_stream_view stream)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_flat_handle::search(%u, %u, %zu)", n_queries, k, neighbors);
 
-  params_.nprobe = n_probes;
+  RAFT_EXPECTS(is_trained(),
+               "The index must be trained before the search (ivf_flat_handle::build)");
   RAFT_EXPECTS(n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
 
@@ -301,7 +312,7 @@ void ivf_flat_handle<T>::search(const T* queries,
     search_mem_res_.emplace(cur_memory_resource, Pow2<256>::roundUp(n_queries * n_probes * k * 16));
   }
 
-  search_impl<float>(queries, n_queries, k, select_min, neighbors, distances);
+  search_impl<float>(queries, n_queries, k, n_probes, select_min, neighbors, distances, stream);
 }
 
 template <typename T>
@@ -309,24 +320,27 @@ template <typename AccT>
 void ivf_flat_handle<T>::search_impl(const T* queries,
                                      uint32_t n_queries,
                                      uint32_t k,
+                                     uint32_t n_probes,
                                      bool select_min,
                                      size_t* neighbors,
-                                     AccT* distances)
+                                     AccT* distances,
+                                     rmm::cuda_stream_view stream)
 {
-  uint32_t n_probes = std::min(params_.nprobe, params_.nlist);
-  auto search_mr    = &(search_mem_res_.value());
+  auto n_lists   = index_->n_lists();
+  n_probes       = std::min<uint32_t>(n_probes, n_lists);
+  auto search_mr = &(search_mem_res_.value());
   // The norm of query
-  rmm::device_uvector<float> query_norm_dev(n_queries, stream_, search_mr);
+  rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
   // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(n_queries * params_.nlist, stream_, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * n_lists, stream, search_mr);
   // The topk distance value of cluster(list) and queries
-  rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream_, search_mr);
+  rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream, search_mr);
   // The topk  index of cluster(list) and queries
-  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries * n_probes, stream_, search_mr);
+  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries * n_probes, stream, search_mr);
   // The topk distance value of candicate vectors from each cluster(list)
-  rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream_, search_mr);
+  rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream, search_mr);
   // The topk index of candicate vectors from each cluster(list)
-  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream_, search_mr);
+  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream, search_mr);
 
   size_t float_query_size;
   if constexpr (std::is_integral_v<T>) {
@@ -334,14 +348,14 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
   } else {
     float_query_size = 0;
   }
-  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream_, search_mr);
+  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream, search_mr);
   float* converted_queries_ptr = converted_queries_dev.data();
 
   if constexpr (std::is_same_v<T, float>) {
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
     linalg::unaryOp(
-      converted_queries_ptr, queries, n_queries * index_->dim(), utils::mapping<float>{}, stream_);
+      converted_queries_ptr, queries, n_queries * index_->dim(), utils::mapping<float>{}, stream);
   }
 
   float alpha = 1.0f;
@@ -351,13 +365,13 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
     alpha = -2.0f;
     beta  = 1.0f;
     utils::dots_along_rows(
-      n_queries, index_->dim(), converted_queries_ptr, query_norm_dev.data(), stream_);
+      n_queries, index_->dim(), converted_queries_ptr, query_norm_dev.data(), stream);
     utils::outer_add(query_norm_dev.data(),
                      n_queries,
                      index_->center_norms->data(),
-                     params_.nlist,
+                     n_lists,
                      distance_buffer_dev.data(),
-                     stream_);
+                     stream);
     RAFT_LOG_TRACE_VEC(index_->center_norms->data(), 20);
     RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   } else {
@@ -368,7 +382,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
   linalg::gemm(handle_,
                true,
                false,
-               params_.nlist,
+               n_lists,
                n_queries,
                index_->dim(),
                &alpha,
@@ -378,30 +392,30 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                index_->dim(),
                &beta,
                distance_buffer_dev.data(),
-               params_.nlist,
-               stream_);
+               n_lists,
+               stream);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
   if (n_probes <= raft::spatial::knn::detail::topk::kMaxCapacity) {
     topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
                                          nullptr,
                                          n_queries,
-                                         params_.nlist,
+                                         n_lists,
                                          n_probes,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
                                          select_min,
-                                         stream_);
+                                         stream);
   } else {
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                               nullptr,
                                               n_queries,
-                                              params_.nlist,
+                                              n_lists,
                                               n_probes,
                                               coarse_distances_dev.data(),
                                               coarse_indices_dev.data(),
                                               select_min,
-                                              stream_,
+                                              stream,
                                               &(search_mem_res_.value()));
   }
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
@@ -424,7 +438,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                                                                     nullptr,
                                                                     nullptr,
                                                                     grid_dim_x,
-                                                                    stream_);
+                                                                    stream);
   } else {
     grid_dim_x = 1;
   }
@@ -445,7 +459,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                                                                   indices_dev_ptr,
                                                                   distances_dev_ptr,
                                                                   grid_dim_x,
-                                                                  stream_);
+                                                                  stream);
 
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
@@ -461,7 +475,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                                          distances,
                                          neighbors,
                                          select_min,
-                                         stream_);
+                                         stream);
     } else {
       topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
                                               refined_indices_dev.data(),
@@ -471,7 +485,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                                               distances,
                                               neighbors,
                                               select_min,
-                                              stream_,
+                                              stream,
                                               &(search_mem_res_.value()));
     }
   }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 89fea9d6b6..3ec8d0bc51 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -98,7 +98,7 @@ void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            IntType D)
 {
   index->ivf_flat<T>() = std::make_unique<detail::ivf_flat_handle<T>>(handle, *params);
-  index->ivf_flat<T>()->build(dataset, n, D, metric);
+  index->ivf_flat<T>()->build(dataset, n, D, metric, handle.get_stream());
 }
 
 template <typename IntType = int>
@@ -230,7 +230,7 @@ void approx_knn_search(const handle_t& handle,
     if (dynamic_cast<ivf_flat_params*>(params)) {
       ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
       index->ivf_flat<T>()->search(
-        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances);
+        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances, handle.get_stream());
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
@@ -240,7 +240,7 @@ void approx_knn_search(const handle_t& handle,
     if (dynamic_cast<ivf_flat_params*>(params)) {
       ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
       index->ivf_flat<T>()->search(
-        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances);
+        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances, handle.get_stream());
     }
     query_metric_processor->revert(query_array);
 

From 2a3eb3385091a815be5701953fb5bf99409453bc Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Jun 2022 16:02:18 +0200
Subject: [PATCH 075/118] Refactor api to have symmetric index/search params

---
 cpp/bench/spatial/knn.cu                      |  40 ++++---
 cpp/include/raft/spatial/knn/ann.cuh          |  24 ++--
 .../knn/{ann_common.h => ann_common.hpp}      |  39 ++++---
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  |  53 +++++----
 .../knn/detail/ann_ivf_flat_kernel.cuh        |  86 +-------------
 .../knn/detail/ann_kmeans_balanced.cuh        |   2 +-
 .../raft/spatial/knn/detail/ann_quantized.cuh | 105 +++++++++---------
 .../raft/spatial/knn/detail/ann_utils.cuh     |   2 +-
 cpp/test/spatial/ann_ivf_flat.cu              |  20 ++--
 9 files changed, 144 insertions(+), 227 deletions(-)
 rename cpp/include/raft/spatial/knn/{ann_common.h => ann_common.hpp} (75%)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index bce1902ade..5f97e3ae57 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -132,22 +132,20 @@ struct host_uvector {
 template <typename ValT, typename IdxT>
 struct ivf_flat_knn {
   raft::spatial::knn::knnIndex index;
-  raft::spatial::knn::ivf_flat_params ivf_params;
+  raft::spatial::knn::ivf_flat_index_params index_params;
+  raft::spatial::knn::ivf_flat_search_params search_params;
   params ps;
 
   ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
   {
-    ivf_params.nlist  = 4096;
-    ivf_params.nprobe = 20;
-    raft::spatial::knn::approx_knn_build_index<ValT, IdxT>(
-      const_cast<raft::handle_t&>(handle),
-      &(index),
-      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivf_params),
-      raft::distance::DistanceType::L2Unexpanded,
-      2.0f,
-      const_cast<ValT*>(data),
-      (IdxT)ps.n_samples,
-      (IdxT)ps.n_dims);
+    index_params.n_lists = 4096;
+    index_params.metric  = raft::distance::DistanceType::L2Expanded;
+    raft::spatial::knn::approx_knn_build_index<ValT, IdxT>(const_cast<raft::handle_t&>(handle),
+                                                           &index,
+                                                           index_params,
+                                                           const_cast<ValT*>(data),
+                                                           (IdxT)ps.n_samples,
+                                                           (IdxT)ps.n_dims);
   }
 
   void search(const raft::handle_t& handle,
@@ -155,15 +153,15 @@ struct ivf_flat_knn {
               ValT* out_dists,
               IdxT* out_idxs)
   {
-    raft::spatial::knn::approx_knn_search<ValT, IdxT>(
-      const_cast<raft::handle_t&>(handle),
-      out_dists,
-      out_idxs,
-      &(index),
-      dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivf_params),
-      (IdxT)ps.k,
-      const_cast<ValT*>(search_items),
-      (IdxT)ps.n_probes);
+    search_params.n_probes = 20;
+    raft::spatial::knn::approx_knn_search<ValT, IdxT>(const_cast<raft::handle_t&>(handle),
+                                                      out_dists,
+                                                      out_idxs,
+                                                      &index,
+                                                      search_params,
+                                                      (IdxT)ps.k,
+                                                      const_cast<ValT*>(search_items),
+                                                      (IdxT)ps.n_probes);
   }
 };
 
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index a5e05bc82b..51e76d44f2 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "ann_common.h"
+#include "ann_common.hpp"
 #include "detail/ann_quantized.cuh"
 
 #include <raft/spatial/knn/faiss_mr.hpp>
@@ -30,23 +30,19 @@ namespace raft::spatial::knn {
  * @param[in] handle RAFT handle
  * @param[out] index index to be built
  * @param[in] params parametrization of the index to be built
- * @param[in] metric distance metric to use. Euclidean (L2) is used by default
- * @param[in] metricArg metric argument
  * @param[in] index_array the index array to build the index with
  * @param[in] n number of rows in the index array
  * @param[in] D the dimensionality of the index array
  */
 template <typename T = float, typename value_idx = int>
 inline void approx_knn_build_index(const raft::handle_t& handle,
-                                   raft::spatial::knn::knnIndex* index,
-                                   knnIndexParam* params,
-                                   raft::distance::DistanceType metric,
-                                   float metricArg,
+                                   knnIndex* index,
+                                   const knn_index_params& params,
                                    T* index_array,
                                    value_idx n,
                                    value_idx D)
 {
-  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
+  detail::approx_knn_build_index(handle, index, params, index_array, n, D);
 }
 
 /**
@@ -58,22 +54,22 @@ inline void approx_knn_build_index(const raft::handle_t& handle,
  *                       their query point
  * @param[out] indices indices of the nearest neighbors
  * @param[in] index index to perform a search with
- * @param[in] params parameters used to build the index
+ * @param[in] params configure search
  * @param[in] k the number of nearest neighbors to search for
  * @param[in] query_array the query to perform a search with
- * @param[in] n number of rows in the query array
+ * @param[in] n_queries number of rows in the query array
  */
 template <typename T = float, typename value_idx = int>
 inline void approx_knn_search(const raft::handle_t& handle,
                               float* distances,
                               int64_t* indices,
-                              raft::spatial::knn::knnIndex* index,
-                              knnIndexParam* params,
+                              knnIndex* index,
+                              const knn_search_params& params,
                               value_idx k,
                               T* query_array,
-                              value_idx n)
+                              value_idx n_queries)
 {
-  detail::approx_knn_search(handle, distances, indices, index, params, k, query_array, n);
+  detail::approx_knn_search(handle, distances, indices, index, params, k, query_array, n_queries);
 }
 
 }  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.hpp
similarity index 75%
rename from cpp/include/raft/spatial/knn/ann_common.h
rename to cpp/include/raft/spatial/knn/ann_common.hpp
index 6a8c5056ef..e916747bbb 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.hpp
@@ -75,33 +75,40 @@ enum QuantizerType : unsigned int {
   QT_6bit
 };
 
-struct knnIndexParam {
-  virtual ~knnIndexParam() {}
+struct knn_index_params {
+  /** Distance type. */
+  raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
+  /** The argument used by some distance metrics. */
+  float metric_arg = 2.0f;
+
+  virtual ~knn_index_params() = default;
+};
+
+struct knn_search_params {
+  virtual ~knn_search_params() = default;
 };
 
-struct IVFParam : knnIndexParam {
+struct ivf_index_params : knn_index_params {
   /** The number of inverted lists (clusters) */
-  int nlist;
-  /** The number of clusters to search. */
-  int nprobe;
+  uint32_t n_lists = 1024;
 };
 
-struct ivf_flat_params : IVFParam {
-  /** The number of iterations searching for kmeans centers (index building). */
-  uint32_t kmeans_n_iters = 20;
-  /** The fraction of data to use during iterative kmeans building. */
-  double kmeans_trainset_fraction = 0.5;
+struct ivf_search_params : knn_search_params {
+  /** The number of clusters to search. */
+  uint32_t n_probes = 20;
 };
 
-struct IVFPQParam : IVFParam {
-  int M;
+// TODO: move to ivf_pq
+struct ivf_pq_index_params : ivf_index_params {
+  int n_subquantizers;
   int n_bits;
-  bool usePrecomputedTables;
+  bool use_precomputed_tables;
 };
 
-struct IVFSQParam : IVFParam {
+// TODO: move to ivf_sq
+struct ivf_sq_index_params : ivf_index_params {
   QuantizerType qtype;
-  bool encodeResidual;
+  bool encode_residual;
 };
 
 };  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
index 8f0d682a41..9387ae7978 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
@@ -16,7 +16,9 @@
 
 #pragma once
 
-#include "../ann_common.h"
+#include "../ann_common.hpp"
+#include "../ivf_flat.hpp"
+
 #include "ann_ivf_flat_kernel.cuh"
 #include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
@@ -31,7 +33,7 @@
 #include <raft/distance/distance_type.hpp>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/unary_op.cuh>
-#include <raft/spatial/knn/ann_common.h>
+#include <raft/spatial/knn/ann_common.hpp>
 #include <raft/stats/histogram.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -45,10 +47,7 @@ namespace raft::spatial::knn::detail {
 template <typename T>
 class ivf_flat_handle {
  public:
-  ivf_flat_handle(const handle_t& handle, ivf_flat_params params)
-    : handle_(handle), params_(std::move(params))
-  {
-  }
+  ivf_flat_handle(const handle_t& handle) : handle_(handle) {}
 
   /**
    * @brief Build the index from the dataset for efficient search.
@@ -56,13 +55,13 @@ class ivf_flat_handle {
    * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
    * @param n_rows number of samples
    * @param dim the dimensionality of the data
-   * @param metric distance type
+   * @param params configure the index building
    * @param stream
    */
   void build(const T* dataset,
              uint32_t n_rows,
              uint32_t dim,
-             raft::distance::DistanceType metric,
+             const ivf_flat_index_params& params,
              rmm::cuda_stream_view stream);
 
   /**
@@ -71,7 +70,7 @@ class ivf_flat_handle {
    * @param[in] queries a device pointer to a row-major matrix [n_queries, dim]
    * @param n_queries is the batch size
    * @param k is the number of neighbors to find for each query.
-   * @param n_probes number of clusters to look at for each query (affects speed vs recall).
+   * @param params configure the search
    * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
    * [n_queries, k]
    * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries,
@@ -81,7 +80,7 @@ class ivf_flat_handle {
   void search(const T* queries,
               uint32_t n_queries,
               uint32_t k,
-              uint32_t n_probes,
+              const ivf_flat_search_params& params,
               size_t* neighbors,
               float* distances,
               rmm::cuda_stream_view stream);
@@ -94,7 +93,6 @@ class ivf_flat_handle {
 
  private:
   const handle_t& handle_;
-  const ivf_flat_params params_;
 
   // The built index
   std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
@@ -185,7 +183,7 @@ template <typename T>
 void ivf_flat_handle<T>::build(const T* dataset,
                                uint32_t n_rows,
                                uint32_t dim,
-                               raft::distance::DistanceType metric,
+                               const ivf_flat_index_params& params,
                                rmm::cuda_stream_view stream)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
@@ -200,7 +198,7 @@ void ivf_flat_handle<T>::build(const T* dataset,
   while (dim % veclen != 0) {
     veclen = veclen >> 1;
   }
-  auto n_lists = static_cast<uint32_t>(params_.nlist);
+  auto n_lists = static_cast<uint32_t>(params.n_lists);
 
   // kmeans cluster ids for the dataset
   rmm::device_uvector<uint32_t> labels(n_rows, stream);
@@ -210,7 +208,7 @@ void ivf_flat_handle<T>::build(const T* dataset,
 
   // Predict labels of the whole dataset
   kmeans::build_optimized_kmeans(handle_,
-                                 params_.kmeans_n_iters,
+                                 params.kmeans_n_iters,
                                  dim,
                                  dataset,
                                  n_rows,
@@ -218,8 +216,8 @@ void ivf_flat_handle<T>::build(const T* dataset,
                                  list_sizes_ptr,
                                  centers.data(),
                                  n_lists,
-                                 params_.kmeans_trainset_fraction,
-                                 metric,
+                                 params.kmeans_trainset_fraction,
+                                 params.metric,
                                  stream);
 
   // Calculate offsets into cluster data using exclusive scan
@@ -264,13 +262,13 @@ void ivf_flat_handle<T>::build(const T* dataset,
     RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
   };
-  auto&& center_norms = metric == raft::distance::DistanceType::L2Expanded
+  auto&& center_norms = params.metric == raft::distance::DistanceType::L2Expanded
                           ? std::optional(compute_norms())
                           : std::nullopt;
 
   // assemble the index
   index_.emplace(ivf_flat_index<T>{
-    veclen, metric, data, indices, list_sizes, list_offsets, centers, center_norms});
+    veclen, params.metric, data, indices, list_sizes, list_offsets, centers, center_norms});
 
   // check index invariants
   index_->check_consistency();
@@ -280,7 +278,7 @@ template <typename T>
 void ivf_flat_handle<T>::search(const T* queries,
                                 uint32_t n_queries,
                                 uint32_t k,
-                                uint32_t n_probes,
+                                const ivf_flat_search_params& params,
                                 size_t* neighbors,
                                 float* distances,
                                 rmm::cuda_stream_view stream)
@@ -290,8 +288,9 @@ void ivf_flat_handle<T>::search(const T* queries,
 
   RAFT_EXPECTS(is_trained(),
                "The index must be trained before the search (ivf_flat_handle::build)");
-  RAFT_EXPECTS(n_probes > 0,
+  RAFT_EXPECTS(params.n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
+  auto n_probes = std::min<uint32_t>(params.n_probes, index_->n_lists());
 
   bool select_min;
   switch (index_->metric) {
@@ -326,13 +325,11 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                                      AccT* distances,
                                      rmm::cuda_stream_view stream)
 {
-  auto n_lists   = index_->n_lists();
-  n_probes       = std::min<uint32_t>(n_probes, n_lists);
   auto search_mr = &(search_mem_res_.value());
   // The norm of query
   rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
   // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(n_queries * n_lists, stream, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * index_->n_lists(), stream, search_mr);
   // The topk distance value of cluster(list) and queries
   rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream, search_mr);
   // The topk  index of cluster(list) and queries
@@ -369,7 +366,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
     utils::outer_add(query_norm_dev.data(),
                      n_queries,
                      index_->center_norms->data(),
-                     n_lists,
+                     index_->n_lists(),
                      distance_buffer_dev.data(),
                      stream);
     RAFT_LOG_TRACE_VEC(index_->center_norms->data(), 20);
@@ -382,7 +379,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
   linalg::gemm(handle_,
                true,
                false,
-               n_lists,
+               index_->n_lists(),
                n_queries,
                index_->dim(),
                &alpha,
@@ -392,7 +389,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
                index_->dim(),
                &beta,
                distance_buffer_dev.data(),
-               n_lists,
+               index_->n_lists(),
                stream);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
@@ -400,7 +397,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
     topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
                                          nullptr,
                                          n_queries,
-                                         n_lists,
+                                         index_->n_lists(),
                                          n_probes,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
@@ -410,7 +407,7 @@ void ivf_flat_handle<T>::search_impl(const T* queries,
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                               nullptr,
                                               n_queries,
-                                              n_lists,
+                                              index_->n_lists(),
                                               n_probes,
                                               coarse_distances_dev.data(),
                                               coarse_indices_dev.data(),
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
index ff5b136391..d99a7024c1 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
@@ -18,7 +18,8 @@
 
 // #define USE_FAISS
 
-#include "../ann_common.h"
+#include "../ann_common.hpp"
+#include "../ivf_flat.hpp"
 #include "ann_utils.cuh"
 #include "topk/warpsort_topk.cuh"
 
@@ -44,90 +45,11 @@ namespace raft::spatial::knn::detail {
 
 constexpr int kThreadsPerBlock = 128;
 
-namespace md = std::experimental;
-
-template <typename T>
-struct ivf_flat_index {
-  using row_major = md::layout_right;
-  using extent_1d = md::extents<dynamic_extent>;
-  using extent_2d = md::extents<dynamic_extent, dynamic_extent>;
-
-  /**
-   * Vectorized load/store size in elements, determines the size of interleaved data chunks.
-   */
-  uint32_t veclen;
-  /** Distance metric used for clustering. */
-  raft::distance::DistanceType metric;
-
-  /**
-   * Inverted list data [size, dim].
-   *
-   * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
-   * Within each list (cluster), the data is grouped into blocks of `WarpSize` interleaved
-   * vectors. Note, the total index length is slightly larger than the source dataset length,
-   * because each cluster is padded by `WarpSize` elements.
-   *
-   * Interleaving pattern:
-   * within groups of `WarpSize` rows, the data is interleaved with the block size equal to
-   * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
-   * followed by a chunk of the same size of the next row, and so on.
-   *
-   * __Example__: veclen = 2, dim = 6, WarpSize = 32, list_size = 31
-   * `
-   *   x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
-   *   x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
-   *   x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
-   *   x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
-   *   x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
-   *   x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
-   * `
-   */
-  device_mdarray<T, extent_2d, row_major> data;
-  /** Inverted list indices: ids of items in the source data [size] */
-  device_mdarray<uint32_t, extent_1d, row_major> indices;
-  /** Sizes of the lists (clusters) [n_lists] */
-  device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
-  /**
-   * Offsets into the lists [n_lists + 1].
-   * The last value contains the total length of the index.
-   */
-  device_mdarray<uint32_t, extent_1d, row_major> list_offsets;
-  /** k-means cluster centers corresponding to the lists [n_lists, dim] */
-  device_mdarray<float, extent_2d, row_major> centers;
-  /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metrix [n_lists]  */
-  std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
-
-  /** Total length of the index. */
-  [[nodiscard]] constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
-  /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
-  /** Number of clusters/inverted lists. */
-  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> size_t
-  {
-    return centers.extent(0);
-  }
-
-  /** Throw an error if the index content is inconsistent. */
-  inline void check_consistency() const
-  {
-    RAFT_EXPECTS(dim() % veclen == 0, "dimensionality is not a multiple of the veclen");
-    RAFT_EXPECTS(data.extent(0) == indices.extent(0), "inconsistent index size");
-    RAFT_EXPECTS(data.extent(1) == centers.extent(1), "inconsistent data dimensionality");
-    RAFT_EXPECTS(                                             //
-      (centers.extent(0) == list_sizes.extent(0)) &&          //
-        (centers.extent(0) + 1 == list_offsets.extent(0)) &&  //
-        (!center_norms.has_value() || centers.extent(0) == center_norms->extent(0)),
-      "inconsistent number of lists (clusters)");
-    RAFT_EXPECTS(reinterpret_cast<size_t>(data.data()) % (veclen * sizeof(T)) == 0,
-                 "The data storage pointer is not aligned to the vector length");
-  }
-};
-
 template <typename T, typename... Extents>
 static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
 {
-  using extent_t  = md::extents<((void)exts, dynamic_extent)...>;
-  using mdarray_t = device_mdarray<T, extent_t, md::layout_right>;
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<T, extent_t, layout_c_contiguous>;
 
   typename mdarray_t::extents_type extent{exts...};
   typename mdarray_t::mapping_type layout{extent};
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 2073b67db6..bc3273b9ba 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../ann_common.h"
+#include "../ann_common.hpp"
 #include "ann_utils.cuh"
 
 #include <raft/common/nvtx.hpp>
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 3ec8d0bc51..981f058433 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../ann_common.h"
+#include "../ann_common.hpp"
 #include "knn_brute_force_faiss.cuh"
 
 #include "common_faiss.h"
@@ -76,85 +76,87 @@ inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qty
 }
 
 template <typename IntType = int>
-void approx_knn_ivfflat_build_index(
-  knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+void approx_knn_ivfflat_build_index(knnIndex* index,
+                                    const ivf_index_params& params,
+                                    IntType n,
+                                    IntType D)
 {
   faiss::gpu::GpuIndexIVFFlatConfig config;
   config.device                  = index->device;
-  faiss::MetricType faiss_metric = build_faiss_metric(metric);
+  faiss::MetricType faiss_metric = build_faiss_metric(params.metric);
   faiss::gpu::GpuIndexIVFFlat* faiss_index =
-    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config);
-  faiss_index->setNumProbes(params->nprobe);
+    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params.n_lists, faiss_metric, config);
   index->index = faiss_index;
 }
 
 template <typename T = float, typename IntType = int>
 void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            knnIndex* index,
-                                           ivf_flat_params* params,
-                                           raft::distance::DistanceType metric,
+                                           const ivf_flat_index_params& params,
                                            T* dataset,
                                            IntType n,
                                            IntType D)
 {
-  index->ivf_flat<T>() = std::make_unique<detail::ivf_flat_handle<T>>(handle, *params);
-  index->ivf_flat<T>()->build(dataset, n, D, metric, handle.get_stream());
+  index->ivf_flat<T>() = std::make_unique<detail::ivf_flat_handle<T>>(handle);
+  index->ivf_flat<T>()->build(dataset, n, D, params, handle.get_stream());
 }
 
 template <typename IntType = int>
-void approx_knn_ivfpq_build_index(
-  knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+void approx_knn_ivfpq_build_index(knnIndex* index,
+                                  const ivf_pq_index_params& params,
+                                  IntType n,
+                                  IntType D)
 {
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device                          = index->device;
-  config.usePrecomputedTables            = params->usePrecomputedTables;
-  config.interleavedLayout               = params->n_bits != 8;
-  faiss::MetricType faiss_metric         = build_faiss_metric(metric);
+  config.usePrecomputedTables            = params.use_precomputed_tables;
+  config.interleavedLayout               = params.n_bits != 8;
+  faiss::MetricType faiss_metric         = build_faiss_metric(params.metric);
   faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ(
-    index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config);
-  faiss_index->setNumProbes(params->nprobe);
+    index->gpu_res, D, params.n_lists, params.n_subquantizers, params.n_bits, faiss_metric, config);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfsq_build_index(
-  knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+void approx_knn_ivfsq_build_index(knnIndex* index,
+                                  const ivf_sq_index_params& params,
+                                  IntType n,
+                                  IntType D)
 {
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
   config.device                                       = index->device;
-  faiss::MetricType faiss_metric                      = build_faiss_metric(metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params->qtype);
+  faiss::MetricType faiss_metric                      = build_faiss_metric(params.metric);
+  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params.qtype);
   faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer(
-    index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual);
-  faiss_index->setNumProbes(params->nprobe);
+    index->gpu_res, D, params.n_lists, faiss_qtype, faiss_metric, params.encode_residual);
   index->index = faiss_index;
 }
 
 template <typename T = float, typename IntType = int>
 void approx_knn_build_index(const handle_t& handle,
-                            raft::spatial::knn::knnIndex* index,
-                            raft::spatial::knn::knnIndexParam* params,
-                            raft::distance::DistanceType metric,
-                            float metricArg,
+                            knnIndex* index,
+                            const knn_index_params& params,
                             T* index_array,
                             IntType n,
                             IntType D)
 {
   auto stream      = handle.get_stream();
+  auto metric      = params.metric;
   index->index     = nullptr;
   index->metric    = metric;
-  index->metricArg = metricArg;
+  index->metricArg = params.metric_arg;
   int device;
   RAFT_CUDA_TRY(cudaGetDevice(&device));
-  index->device = device;
+  index->device    = device;
+  auto ivf_ft_pams = dynamic_cast<const ivf_flat_index_params*>(&params);
+  auto ivf_pq_pams = dynamic_cast<const ivf_pq_index_params*>(&params);
+  auto ivf_sq_pams = dynamic_cast<const ivf_sq_index_params*>(&params);
 
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
-    if (dynamic_cast<ivf_flat_params*>(params)) {
-      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
-      approx_knn_cuivfl_ivfflat_build_index(
-        handle, index, IVFFlat_param, metric, index_array, n, D);
+    if (ivf_ft_pams) {
+      approx_knn_cuivfl_ivfflat_build_index(handle, index, *ivf_ft_pams, index_array, n, D);
     } else {
       RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
     }
@@ -162,22 +164,20 @@ void approx_knn_build_index(const handle_t& handle,
     std::unique_ptr<MetricProcessor<float>> query_metric_processor =
       create_processor<float>(metric, n, D, 0, false, stream);
 
-    if (dynamic_cast<ivf_flat_params*>(params)) {
-      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
+    if (ivf_ft_pams) {
       // cuivfl only supports L2/Inner product for now.
       if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
           metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
           metric == raft::distance::DistanceType::L2Unexpanded ||
           metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::InnerProduct) {
-        approx_knn_cuivfl_ivfflat_build_index(
-          handle, index, IVFFlat_param, metric, index_array, n, D);
+        approx_knn_cuivfl_ivfflat_build_index(handle, index, *ivf_ft_pams, index_array, n, D);
       } else {
         raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
         gpu_res->noTempMemory();
         gpu_res->setDefaultStream(device, stream);
         index->gpu_res = gpu_res;
-        approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
+        approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
         std::vector<float> h_index_array(n * D);
         raft::update_host(h_index_array.data(), index_array, h_index_array.size(), stream);
         query_metric_processor->revert(index_array);
@@ -192,12 +192,10 @@ void approx_knn_build_index(const handle_t& handle,
       gpu_res->setDefaultStream(device, stream);
       index->gpu_res = gpu_res;
       query_metric_processor->preprocess(index_array);
-      if (dynamic_cast<IVFPQParam*>(params)) {
-        IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
-        approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
-      } else if (dynamic_cast<IVFSQParam*>(params)) {
-        IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
-        approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
+      if (ivf_pq_pams) {
+        approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
+      } else if (ivf_sq_pams) {
+        approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
       } else {
         ASSERT(index->index, "KNN index could not be initialized");
       }
@@ -213,12 +211,16 @@ template <typename T = float, typename IntType = int>
 void approx_knn_search(const handle_t& handle,
                        float* distances,
                        int64_t* indices,
-                       raft::spatial::knn::knnIndex* index,
-                       raft::spatial::knn::knnIndexParam* params,
+                       knnIndex* index,
+                       const knn_search_params& params,
                        IntType k,
                        T* query_array,
                        IntType n)
 {
+  if (dynamic_cast<GpuIndexIVF*>(index->index) && dynamic_cast<const ivf_search_params*>(&params)) {
+    dynamic_cast<GpuIndexIVF*>(index->index)
+      ->setNumProbes(dynamic_cast<const ivf_search_params&>(params).n_probes);
+  }
   // perform preprocessing
 #if 0
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
@@ -226,21 +228,20 @@ void approx_knn_search(const handle_t& handle,
   query_metric_processor->preprocess(query_array);
     index->index->search(n, query_array, k, distances, indices);
 #else
+  auto ivf_ft_pams = dynamic_cast<const ivf_flat_search_params*>(&params);
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
-    if (dynamic_cast<ivf_flat_params*>(params)) {
-      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
+    if (ivf_ft_pams) {
       index->ivf_flat<T>()->search(
-        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances, handle.get_stream());
+        query_array, n, k, *ivf_ft_pams, (size_t*)indices, distances, handle.get_stream());
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
       index->metric, n, index->ivf_flat<T>()->data_dim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
-    if (dynamic_cast<ivf_flat_params*>(params)) {
-      ivf_flat_params* IVFFlat_param = dynamic_cast<ivf_flat_params*>(params);
+    if (ivf_ft_pams) {
       index->ivf_flat<T>()->search(
-        query_array, n, k, IVFFlat_param->nprobe, (size_t*)indices, distances, handle.get_stream());
+        query_array, n, k, *ivf_ft_pams, (size_t*)indices, distances, handle.get_stream());
     }
     query_metric_processor->revert(query_array);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index cae531a312..6e8d36820b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../ann_common.h"
+#include "../ann_common.hpp"
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 0ae5aa2a4f..0b8185ffb2 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -145,28 +145,24 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     {
       rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
       rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
-      raft::spatial::knn::ivf_flat_params params;
-      params.nprobe = ps.nprobe;
-      params.nlist  = ps.nlist;
+      raft::spatial::knn::ivf_flat_index_params index_params;
+      raft::spatial::knn::ivf_flat_search_params search_params;
+      index_params.n_lists   = ps.nlist;
+      index_params.metric    = ps.metric;
+      search_params.n_probes = ps.nprobe;
       raft::spatial::knn::knnIndex index;
       index.index   = nullptr;
       index.gpu_res = nullptr;
 
-      approx_knn_build_index(handle_,
-                             &index,
-                             dynamic_cast<raft::spatial::knn::knnIndexParam*>(&params),
-                             ps.metric,
-                             0,
-                             database.data(),
-                             ps.num_db_vecs,
-                             ps.dim);
+      approx_knn_build_index(
+        handle_, &index, index_params, database.data(), ps.num_db_vecs, ps.dim);
       handle_.sync_stream(stream_);
 
       approx_knn_search(handle_,
                         distances_ivfflat_dev.data(),
                         indices_ivfflat_dev.data(),
                         &index,
-                        dynamic_cast<raft::spatial::knn::knnIndexParam*>(&params),
+                        search_params,
                         ps.k,
                         search_queries.data(),
                         ps.num_queries);

From 867beca8e83f72e215980bbfce1f66e6643b6ed4 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 22 Jun 2022 11:26:37 +0200
Subject: [PATCH 076/118] refactor away ivf_flat_index

---
 cpp/bench/spatial/knn.cu                      |   4 +-
 cpp/include/raft/spatial/knn/ann_common.hpp   |  18 +-
 .../raft/spatial/knn/detail/ann_ivf_flat.cuh  | 491 ------------------
 .../knn/detail/ann_kmeans_balanced.cuh        |   1 +
 .../raft/spatial/knn/detail/ann_quantized.cuh |  41 +-
 .../spatial/knn/detail/ivf_flat_build.cuh     | 213 ++++++++
 ...vf_flat_kernel.cuh => ivf_flat_search.cuh} | 256 ++++++++-
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |  83 +++
 .../raft/spatial/knn/ivf_flat_types.hpp       | 118 +++++
 cpp/test/spatial/ann_ivf_flat.cu              |   4 +-
 10 files changed, 691 insertions(+), 538 deletions(-)
 delete mode 100644 cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
 rename cpp/include/raft/spatial/knn/detail/{ann_ivf_flat_kernel.cuh => ivf_flat_search.cuh} (81%)
 create mode 100644 cpp/include/raft/spatial/knn/ivf_flat.cuh
 create mode 100644 cpp/include/raft/spatial/knn/ivf_flat_types.hpp

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 5f97e3ae57..49d1b00dfb 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -132,8 +132,8 @@ struct host_uvector {
 template <typename ValT, typename IdxT>
 struct ivf_flat_knn {
   raft::spatial::knn::knnIndex index;
-  raft::spatial::knn::ivf_flat_index_params index_params;
-  raft::spatial::knn::ivf_flat_search_params search_params;
+  raft::spatial::knn::ivf_flat::index_params index_params;
+  raft::spatial::knn::ivf_flat::search_params search_params;
   params ps;
 
   ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
diff --git a/cpp/include/raft/spatial/knn/ann_common.hpp b/cpp/include/raft/spatial/knn/ann_common.hpp
index e916747bbb..72d5f365b6 100644
--- a/cpp/include/raft/spatial/knn/ann_common.hpp
+++ b/cpp/include/raft/spatial/knn/ann_common.hpp
@@ -22,18 +22,18 @@
 
 namespace raft::spatial::knn {
 
-namespace detail {
+namespace ivf_flat {
 template <typename T>
-class ivf_flat_handle;
+class index;
 };
 
 struct knnIndex {
   faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
-  std::unique_ptr<detail::ivf_flat_handle<float>> ivf_flat_float_;
-  std::unique_ptr<detail::ivf_flat_handle<uint8_t>> ivf_flat_uint8_t_;
-  std::unique_ptr<detail::ivf_flat_handle<int8_t>> ivf_flat_int8_t_;
+  std::unique_ptr<ivf_flat::index<float>> ivf_flat_float_;
+  std::unique_ptr<ivf_flat::index<uint8_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<ivf_flat::index<int8_t>> ivf_flat_int8_t_;
 
   raft::spatial::knn::RmmGpuResources* gpu_res;
   int device;
@@ -44,23 +44,23 @@ struct knnIndex {
   }
 
   template <typename T>
-  auto ivf_flat() -> std::unique_ptr<detail::ivf_flat_handle<T>>&;
+  auto ivf_flat() -> std::unique_ptr<ivf_flat::index<T>>&;
 };
 
 template <>
-auto knnIndex::ivf_flat<float>() -> std::unique_ptr<detail::ivf_flat_handle<float>>&
+auto knnIndex::ivf_flat<float>() -> std::unique_ptr<ivf_flat::index<float>>&
 {
   return ivf_flat_float_;
 }
 
 template <>
-auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<detail::ivf_flat_handle<uint8_t>>&
+auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<ivf_flat::index<uint8_t>>&
 {
   return ivf_flat_uint8_t_;
 }
 
 template <>
-auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<detail::ivf_flat_handle<int8_t>>&
+auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<ivf_flat::index<int8_t>>&
 {
   return ivf_flat_int8_t_;
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh b/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
deleted file mode 100644
index 9387ae7978..0000000000
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat.cuh
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../ann_common.hpp"
-#include "../ivf_flat.hpp"
-
-#include "ann_ivf_flat_kernel.cuh"
-#include "ann_kmeans_balanced.cuh"
-#include "ann_utils.cuh"
-#include "topk/radix_topk.cuh"
-#include "topk/warpsort_topk.cuh"
-
-#include <raft/common/nvtx.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance.cuh>
-#include <raft/distance/distance_type.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/spatial/knn/ann_common.hpp>
-#include <raft/stats/histogram.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-namespace raft::spatial::knn::detail {
-
-template <typename T>
-class ivf_flat_handle {
- public:
-  ivf_flat_handle(const handle_t& handle) : handle_(handle) {}
-
-  /**
-   * @brief Build the index from the dataset for efficient search.
-   *
-   * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
-   * @param n_rows number of samples
-   * @param dim the dimensionality of the data
-   * @param params configure the index building
-   * @param stream
-   */
-  void build(const T* dataset,
-             uint32_t n_rows,
-             uint32_t dim,
-             const ivf_flat_index_params& params,
-             rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Search ANN using the constructed index.
-   *
-   * @param[in] queries a device pointer to a row-major matrix [n_queries, dim]
-   * @param n_queries is the batch size
-   * @param k is the number of neighbors to find for each query.
-   * @param params configure the search
-   * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
-   * [n_queries, k]
-   * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries,
-   * k]
-   * @param stream
-   */
-  void search(const T* queries,
-              uint32_t n_queries,
-              uint32_t k,
-              const ivf_flat_search_params& params,
-              size_t* neighbors,
-              float* distances,
-              rmm::cuda_stream_view stream);
-
-  /** Whether `build` method has already been succesfully invoked. */
-  [[nodiscard]] auto is_trained() const -> bool { return index_.has_value(); }
-
-  /** Dimensionality of the data, on which the index has been built. */
-  [[nodiscard]] auto data_dim() const -> uint32_t { return is_trained() ? index_->dim() : 0; }
-
- private:
-  const handle_t& handle_;
-
-  // The built index
-  std::optional<const ivf_flat_index<T>> index_ = std::nullopt;
-
-  // Memory pool for use during search; after the first search is done the pool is not likely to
-  // resize, saving the costs of allocations.
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> search_mem_res_;
-
-  template <typename AccT>
-  void search_impl(const T* queries,
-                   uint32_t n_queries,
-                   uint32_t k,
-                   uint32_t n_probes,
-                   bool select_min,
-                   size_t* neighbors,
-                   AccT* distances,
-                   rmm::cuda_stream_view stream);
-};
-
-/**
- * @brief Record the dataset into the index, one source row at a time.
- *
- * The index consists of the dataset rows, grouped by their labels (into clusters/lists).
- * Within each cluster (list), the data is grouped into blocks of `WarpSize` interleaved
- * vectors. Note, the total index length is slightly larger than the dataset length, because
- * each cluster is padded by `WarpSize` elements
- *
- * CUDA launch grid:
- *   X dimension must cover the dataset (n_rows), YZ are not used;
- *   there are no dependencies between threads, hence no constraints on the block size.
- *
- * @tparam T the element type.
- *
- * @param[in] labels device pointer to the cluster ids for each row [n_rows]
- * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
- * @param[in] dataset device poitner to the input data [n_rows, dim]
- * @param[out] list_data device pointer to the output [index_size, dim]
- * @param[out] list_index device pointer to the source ids corr. to the output [index_size]
- * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists];
- *                          it's used as an atomic counter, and must be initialized with zeros.
- * @param n_rows source length
- * @param dim the dimensionality of the data
- * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`.
- *
- */
-template <typename T>
-__global__ void build_index_kernel(const uint32_t* labels,
-                                   const uint32_t* list_offsets,
-                                   const T* dataset,
-                                   T* list_data,
-                                   uint32_t* list_index,
-                                   uint32_t* list_sizes_ptr,
-                                   uint32_t n_rows,
-                                   uint32_t dim,
-                                   uint32_t veclen)
-{
-  const int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i >= n_rows) { return; }
-
-  auto list_id     = labels[i];
-  auto inlist_id   = atomicAdd(list_sizes_ptr + list_id, 1);
-  auto list_offset = list_offsets[list_id];
-
-  // Record the source vector id in the index
-  list_index[list_offset + inlist_id] = i;
-
-  // The data is written in interleaved groups of `WarpSize` vectors
-  using interleaved_group = Pow2<WarpSize>;
-  auto group_offset       = interleaved_group::roundDown(inlist_id);
-  auto ingroup_id         = interleaved_group::mod(inlist_id) * veclen;
-
-  // Point to the location of the interleaved group of vectors
-  list_data += (list_offset + group_offset) * dim;
-
-  // Point to the source vector
-  dataset += i * dim;
-
-  // Interleave dimensions of the source vector while recording it.
-  // NB: such `veclen` is selected, that `dim % veclen == 0`
-  for (uint32_t l = 0; l < dim; l += veclen) {
-    for (uint32_t j = 0; j < veclen; j++) {
-      list_data[l * WarpSize + ingroup_id + j] = dataset[l + j];
-    }
-  }
-}
-
-template <typename T>
-void ivf_flat_handle<T>::build(const T* dataset,
-                               uint32_t n_rows,
-                               uint32_t dim,
-                               const ivf_flat_index_params& params,
-                               rmm::cuda_stream_view stream)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "ivf_flat_handle::build(%u, %u)", n_rows, dim);
-  static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
-                "unsupported data type");
-  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
-
-  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
-  // template parameter (https://github.com/rapidsai/raft/issues/711)
-  uint32_t veclen = 16 / sizeof(T);
-  while (dim % veclen != 0) {
-    veclen = veclen >> 1;
-  }
-  auto n_lists = static_cast<uint32_t>(params.n_lists);
-
-  // kmeans cluster ids for the dataset
-  rmm::device_uvector<uint32_t> labels(n_rows, stream);
-  auto&& centers      = make_array_for_index<float>(stream, n_lists, dim);
-  auto&& list_sizes   = make_array_for_index<uint32_t>(stream, n_lists);
-  auto list_sizes_ptr = list_sizes.data();
-
-  // Predict labels of the whole dataset
-  kmeans::build_optimized_kmeans(handle_,
-                                 params.kmeans_n_iters,
-                                 dim,
-                                 dataset,
-                                 n_rows,
-                                 labels.data(),
-                                 list_sizes_ptr,
-                                 centers.data(),
-                                 n_lists,
-                                 params.kmeans_trainset_fraction,
-                                 params.metric,
-                                 stream);
-
-  // Calculate offsets into cluster data using exclusive scan
-  auto&& list_offsets   = make_array_for_index<uint32_t>(stream, n_lists + 1);
-  auto list_offsets_ptr = list_offsets.data();
-
-  thrust::exclusive_scan(
-    rmm::exec_policy(stream),
-    list_sizes_ptr,
-    list_sizes_ptr + n_lists + 1,
-    list_offsets_ptr,
-    uint32_t(0),
-    [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
-
-  uint32_t index_size;
-  update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
-  handle_.sync_stream(stream);
-
-  auto&& data    = make_array_for_index<T>(stream, index_size, dim);
-  auto&& indices = make_array_for_index<uint32_t>(stream, index_size);
-
-  // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
-  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream);
-
-  const dim3 block_dim(256);
-  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows, block_dim.x));
-  build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(labels.data(),
-                                                         list_offsets_ptr,
-                                                         dataset,
-                                                         data.data(),
-                                                         indices.data(),
-                                                         list_sizes_ptr,
-                                                         n_rows,
-                                                         dim,
-                                                         veclen);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  // Precompute the centers vector norms for L2Expanded distance
-  auto compute_norms = [&]() {
-    auto&& r = make_array_for_index<float>(stream, n_lists);
-    utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream);
-    RAFT_LOG_TRACE_VEC(r.data(), 20);
-    return r;
-  };
-  auto&& center_norms = params.metric == raft::distance::DistanceType::L2Expanded
-                          ? std::optional(compute_norms())
-                          : std::nullopt;
-
-  // assemble the index
-  index_.emplace(ivf_flat_index<T>{
-    veclen, params.metric, data, indices, list_sizes, list_offsets, centers, center_norms});
-
-  // check index invariants
-  index_->check_consistency();
-}
-
-template <typename T>
-void ivf_flat_handle<T>::search(const T* queries,
-                                uint32_t n_queries,
-                                uint32_t k,
-                                const ivf_flat_search_params& params,
-                                size_t* neighbors,
-                                float* distances,
-                                rmm::cuda_stream_view stream)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "ivf_flat_handle::search(%u, %u, %zu)", n_queries, k, neighbors);
-
-  RAFT_EXPECTS(is_trained(),
-               "The index must be trained before the search (ivf_flat_handle::build)");
-  RAFT_EXPECTS(params.n_probes > 0,
-               "n_probes (number of clusters to probe in the search) must be positive.");
-  auto n_probes = std::min<uint32_t>(params.n_probes, index_->n_lists());
-
-  bool select_min;
-  switch (index_->metric) {
-    case raft::distance::DistanceType::InnerProduct:
-    case raft::distance::DistanceType::CosineExpanded:
-    case raft::distance::DistanceType::CorrelationExpanded:
-      // Similarity metrics have the opposite meaning, i.e. nearest neigbours are those with larger
-      // similarity (See the same logic at cpp/include/raft/sparse/selection/detail/knn.cuh:362
-      // {perform_k_selection})
-      select_min = false;
-      break;
-    default: select_min = true;
-  }
-
-  // Set memory buffer to be reused across searches
-  auto cur_memory_resource = rmm::mr::get_current_device_resource();
-  if (!search_mem_res_.has_value() || search_mem_res_->get_upstream() != cur_memory_resource) {
-    search_mem_res_.emplace(cur_memory_resource, Pow2<256>::roundUp(n_queries * n_probes * k * 16));
-  }
-
-  search_impl<float>(queries, n_queries, k, n_probes, select_min, neighbors, distances, stream);
-}
-
-template <typename T>
-template <typename AccT>
-void ivf_flat_handle<T>::search_impl(const T* queries,
-                                     uint32_t n_queries,
-                                     uint32_t k,
-                                     uint32_t n_probes,
-                                     bool select_min,
-                                     size_t* neighbors,
-                                     AccT* distances,
-                                     rmm::cuda_stream_view stream)
-{
-  auto search_mr = &(search_mem_res_.value());
-  // The norm of query
-  rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
-  // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(n_queries * index_->n_lists(), stream, search_mr);
-  // The topk distance value of cluster(list) and queries
-  rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream, search_mr);
-  // The topk  index of cluster(list) and queries
-  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries * n_probes, stream, search_mr);
-  // The topk distance value of candicate vectors from each cluster(list)
-  rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream, search_mr);
-  // The topk index of candicate vectors from each cluster(list)
-  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream, search_mr);
-
-  size_t float_query_size;
-  if constexpr (std::is_integral_v<T>) {
-    float_query_size = n_queries * index_->dim();
-  } else {
-    float_query_size = 0;
-  }
-  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream, search_mr);
-  float* converted_queries_ptr = converted_queries_dev.data();
-
-  if constexpr (std::is_same_v<T, float>) {
-    converted_queries_ptr = const_cast<float*>(queries);
-  } else {
-    linalg::unaryOp(
-      converted_queries_ptr, queries, n_queries * index_->dim(), utils::mapping<float>{}, stream);
-  }
-
-  float alpha = 1.0f;
-  float beta  = 0.0f;
-
-  if (index_->metric == raft::distance::DistanceType::L2Expanded) {
-    alpha = -2.0f;
-    beta  = 1.0f;
-    utils::dots_along_rows(
-      n_queries, index_->dim(), converted_queries_ptr, query_norm_dev.data(), stream);
-    utils::outer_add(query_norm_dev.data(),
-                     n_queries,
-                     index_->center_norms->data(),
-                     index_->n_lists(),
-                     distance_buffer_dev.data(),
-                     stream);
-    RAFT_LOG_TRACE_VEC(index_->center_norms->data(), 20);
-    RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
-  } else {
-    alpha = 1.0f;
-    beta  = 0.0f;
-  }
-
-  linalg::gemm(handle_,
-               true,
-               false,
-               index_->n_lists(),
-               n_queries,
-               index_->dim(),
-               &alpha,
-               index_->centers.data(),
-               index_->dim(),
-               converted_queries_ptr,
-               index_->dim(),
-               &beta,
-               distance_buffer_dev.data(),
-               index_->n_lists(),
-               stream);
-
-  RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
-  if (n_probes <= raft::spatial::knn::detail::topk::kMaxCapacity) {
-    topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
-                                         nullptr,
-                                         n_queries,
-                                         index_->n_lists(),
-                                         n_probes,
-                                         coarse_distances_dev.data(),
-                                         coarse_indices_dev.data(),
-                                         select_min,
-                                         stream);
-  } else {
-    topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
-                                              nullptr,
-                                              n_queries,
-                                              index_->n_lists(),
-                                              n_probes,
-                                              coarse_distances_dev.data(),
-                                              coarse_indices_dev.data(),
-                                              select_min,
-                                              stream,
-                                              &(search_mem_res_.value()));
-  }
-  RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
-  RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes);
-
-  AccT* distances_dev_ptr = refined_distances_dev.data();
-  size_t* indices_dev_ptr = refined_indices_dev.data();
-
-  uint32_t grid_dim_x = 0;
-  if (n_probes > 1) {
-    // query the gridDimX size to store probes topK output
-    ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
-                                                                    nullptr,
-                                                                    nullptr,
-                                                                    n_queries,
-                                                                    index_->metric,
-                                                                    n_probes,
-                                                                    k,
-                                                                    select_min,
-                                                                    nullptr,
-                                                                    nullptr,
-                                                                    grid_dim_x,
-                                                                    stream);
-  } else {
-    grid_dim_x = 1;
-  }
-
-  if (grid_dim_x == 1) {
-    distances_dev_ptr = distances;
-    indices_dev_ptr   = neighbors;
-  }
-
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index_.value(),
-                                                                  queries,
-                                                                  coarse_indices_dev.data(),
-                                                                  n_queries,
-                                                                  index_->metric,
-                                                                  n_probes,
-                                                                  k,
-                                                                  select_min,
-                                                                  indices_dev_ptr,
-                                                                  distances_dev_ptr,
-                                                                  grid_dim_x,
-                                                                  stream);
-
-  RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
-  RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
-
-  // Merge topk values from different blocks
-  if (grid_dim_x > 1) {
-    if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
-      topk::warp_sort_topk<AccT, size_t>(refined_distances_dev.data(),
-                                         refined_indices_dev.data(),
-                                         n_queries,
-                                         k * grid_dim_x,
-                                         k,
-                                         distances,
-                                         neighbors,
-                                         select_min,
-                                         stream);
-    } else {
-      topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
-                                              refined_indices_dev.data(),
-                                              n_queries,
-                                              k * grid_dim_x,
-                                              k,
-                                              distances,
-                                              neighbors,
-                                              select_min,
-                                              stream,
-                                              &(search_mem_res_.value()));
-    }
-  }
-}
-
-}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index bc3273b9ba..e039ac9f32 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -20,6 +20,7 @@
 #include "ann_utils.cuh"
 
 #include <raft/common/nvtx.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 981f058433..39363ae53d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "../ann_common.hpp"
+#include "../ivf_flat.cuh"
 #include "knn_brute_force_faiss.cuh"
 
 #include "common_faiss.h"
@@ -47,8 +48,6 @@
 
 #include <raft/distance/distance_type.hpp>
 
-#include "ann_ivf_flat.cuh"
-
 #include <iostream>
 #include <set>
 
@@ -92,13 +91,13 @@ void approx_knn_ivfflat_build_index(knnIndex* index,
 template <typename T = float, typename IntType = int>
 void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
                                            knnIndex* index,
-                                           const ivf_flat_index_params& params,
+                                           const ivf_flat::index_params& params,
                                            T* dataset,
-                                           IntType n,
-                                           IntType D)
+                                           IntType n_rows,
+                                           IntType dim)
 {
-  index->ivf_flat<T>() = std::make_unique<detail::ivf_flat_handle<T>>(handle);
-  index->ivf_flat<T>()->build(dataset, n, D, params, handle.get_stream());
+  index->ivf_flat<T>() = std::make_unique<ivf_flat::index<T>>(
+    ivf_flat::build(handle, params, dataset, n_rows, dim, handle.get_stream()));
 }
 
 template <typename IntType = int>
@@ -148,7 +147,7 @@ void approx_knn_build_index(const handle_t& handle,
   int device;
   RAFT_CUDA_TRY(cudaGetDevice(&device));
   index->device    = device;
-  auto ivf_ft_pams = dynamic_cast<const ivf_flat_index_params*>(&params);
+  auto ivf_ft_pams = dynamic_cast<const ivf_flat::index_params*>(&params);
   auto ivf_pq_pams = dynamic_cast<const ivf_pq_index_params*>(&params);
   auto ivf_sq_pams = dynamic_cast<const ivf_sq_index_params*>(&params);
 
@@ -228,20 +227,34 @@ void approx_knn_search(const handle_t& handle,
   query_metric_processor->preprocess(query_array);
     index->index->search(n, query_array, k, distances, indices);
 #else
-  auto ivf_ft_pams = dynamic_cast<const ivf_flat_search_params*>(&params);
+  auto ivf_ft_pams = dynamic_cast<const ivf_flat::search_params*>(&params);
   if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
     if (ivf_ft_pams) {
-      index->ivf_flat<T>()->search(
-        query_array, n, k, *ivf_ft_pams, (size_t*)indices, distances, handle.get_stream());
+      ivf_flat::search(handle,
+                       *ivf_ft_pams,
+                       *(index->ivf_flat<T>()),
+                       query_array,
+                       n,
+                       k,
+                       (size_t*)indices,
+                       distances,
+                       handle.get_stream());
     }
   } else if constexpr (std::is_same<T, float>{}) {
     std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-      index->metric, n, index->ivf_flat<T>()->data_dim(), k, false, handle.get_stream());
+      index->metric, n, index->ivf_flat<T>()->dim(), k, false, handle.get_stream());
     query_metric_processor->preprocess(query_array);
 
     if (ivf_ft_pams) {
-      index->ivf_flat<T>()->search(
-        query_array, n, k, *ivf_ft_pams, (size_t*)indices, distances, handle.get_stream());
+      ivf_flat::search(handle,
+                       *ivf_ft_pams,
+                       *(index->ivf_flat<T>()),
+                       query_array,
+                       n,
+                       k,
+                       (size_t*)indices,
+                       distances,
+                       handle.get_stream());
     }
     query_metric_processor->revert(query_array);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
new file mode 100644
index 0000000000..0d5456dbf7
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../ivf_flat_types.hpp"
+#include "ann_kmeans_balanced.cuh"
+#include "ann_utils.cuh"
+
+#include <raft/core/handle.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/mdarray.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/pow2_utils.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::spatial::knn::detail::ivf_flat {
+
+using raft::spatial::knn::ivf_flat::index;
+using raft::spatial::knn::ivf_flat::index_params;
+
+template <typename T, typename... Extents>
+static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
+{
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<T, extent_t, layout_c_contiguous>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy{stream};
+
+  return mdarray_t{layout, policy};
+}
+
+/**
+ * @brief Record the dataset into the index, one source row at a time.
+ *
+ * The index consists of the dataset rows, grouped by their labels (into clusters/lists).
+ * Within each cluster (list), the data is grouped into blocks of `WarpSize` interleaved
+ * vectors. Note, the total index length is slightly larger than the dataset length, because
+ * each cluster is padded by `WarpSize` elements
+ *
+ * CUDA launch grid:
+ *   X dimension must cover the dataset (n_rows), YZ are not used;
+ *   there are no dependencies between threads, hence no constraints on the block size.
+ *
+ * @tparam T the element type.
+ *
+ * @param[in] labels device pointer to the cluster ids for each row [n_rows]
+ * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
+ * @param[in] dataset device poitner to the input data [n_rows, dim]
+ * @param[out] list_data device pointer to the output [index_size, dim]
+ * @param[out] list_index device pointer to the source ids corr. to the output [index_size]
+ * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists];
+ *                          it's used as an atomic counter, and must be initialized with zeros.
+ * @param n_rows source length
+ * @param dim the dimensionality of the data
+ * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`.
+ *
+ */
+template <typename T>
+__global__ void build_index_kernel(const uint32_t* labels,
+                                   const uint32_t* list_offsets,
+                                   const T* dataset,
+                                   T* list_data,
+                                   uint32_t* list_index,
+                                   uint32_t* list_sizes_ptr,
+                                   uint32_t n_rows,
+                                   uint32_t dim,
+                                   uint32_t veclen)
+{
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i >= n_rows) { return; }
+
+  auto list_id     = labels[i];
+  auto inlist_id   = atomicAdd(list_sizes_ptr + list_id, 1);
+  auto list_offset = list_offsets[list_id];
+
+  // Record the source vector id in the index
+  list_index[list_offset + inlist_id] = i;
+
+  // The data is written in interleaved groups of `WarpSize` vectors
+  using interleaved_group = Pow2<WarpSize>;
+  auto group_offset       = interleaved_group::roundDown(inlist_id);
+  auto ingroup_id         = interleaved_group::mod(inlist_id) * veclen;
+
+  // Point to the location of the interleaved group of vectors
+  list_data += (list_offset + group_offset) * dim;
+
+  // Point to the source vector
+  dataset += i * dim;
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  for (uint32_t l = 0; l < dim; l += veclen) {
+    for (uint32_t j = 0; j < veclen; j++) {
+      list_data[l * WarpSize + ingroup_id + j] = dataset[l + j];
+    }
+  }
+}
+
+template <typename T>
+inline auto build(const handle_t& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  uint32_t n_rows,
+                  uint32_t dim,
+                  rmm::cuda_stream_view stream) -> index<T>
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("ivf_flat::build(%u, %u)", n_rows, dim);
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+                "unsupported data type");
+  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
+
+  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
+  // template parameter (https://github.com/rapidsai/raft/issues/711)
+  uint32_t veclen = 16 / sizeof(T);
+  while (dim % veclen != 0) {
+    veclen = veclen >> 1;
+  }
+  auto n_lists = static_cast<uint32_t>(params.n_lists);
+
+  // kmeans cluster ids for the dataset
+  rmm::device_uvector<uint32_t> labels(n_rows, stream);
+  auto&& centers      = make_array_for_index<float>(stream, n_lists, dim);
+  auto&& list_sizes   = make_array_for_index<uint32_t>(stream, n_lists);
+  auto list_sizes_ptr = list_sizes.data();
+
+  // Predict labels of the whole dataset
+  kmeans::build_optimized_kmeans(handle,
+                                 params.kmeans_n_iters,
+                                 dim,
+                                 dataset,
+                                 n_rows,
+                                 labels.data(),
+                                 list_sizes_ptr,
+                                 centers.data(),
+                                 n_lists,
+                                 params.kmeans_trainset_fraction,
+                                 params.metric,
+                                 stream);
+
+  // Calculate offsets into cluster data using exclusive scan
+  auto&& list_offsets   = make_array_for_index<uint32_t>(stream, n_lists + 1);
+  auto list_offsets_ptr = list_offsets.data();
+
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream),
+    list_sizes_ptr,
+    list_sizes_ptr + n_lists + 1,
+    list_offsets_ptr,
+    uint32_t(0),
+    [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
+
+  uint32_t index_size;
+  update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
+  handle.sync_stream(stream);
+
+  auto&& data    = make_array_for_index<T>(stream, index_size, dim);
+  auto&& indices = make_array_for_index<uint32_t>(stream, index_size);
+
+  // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
+  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream);
+
+  const dim3 block_dim(256);
+  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows, block_dim.x));
+  build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(labels.data(),
+                                                         list_offsets_ptr,
+                                                         dataset,
+                                                         data.data(),
+                                                         indices.data(),
+                                                         list_sizes_ptr,
+                                                         n_rows,
+                                                         dim,
+                                                         veclen);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  // Precompute the centers vector norms for L2Expanded distance
+  auto compute_norms = [&]() {
+    auto&& r = make_array_for_index<float>(stream, n_lists);
+    utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream);
+    RAFT_LOG_TRACE_VEC(r.data(), 20);
+    return r;
+  };
+  auto&& center_norms = params.metric == raft::distance::DistanceType::L2Expanded
+                          ? std::optional(compute_norms())
+                          : std::nullopt;
+
+  // assemble the index
+  index<T> index{
+    veclen, params.metric, data, indices, list_sizes, list_offsets, centers, center_norms};
+
+  // check index invariants
+  index.check_consistency();
+
+  return index;
+}
+
+}  // namespace raft::spatial::knn::detail::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
similarity index 81%
rename from cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
rename to cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index d99a7024c1..778a2f8a27 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_ivf_flat_kernel.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -18,13 +18,15 @@
 
 // #define USE_FAISS
 
-#include "../ann_common.hpp"
-#include "../ivf_flat.hpp"
+#include "../ivf_flat_types.hpp"
 #include "ann_utils.cuh"
+#include "topk/radix_topk.cuh"
 #include "topk/warpsort_topk.cuh"
 
 #include <raft/common/device_loads_stores.cuh>
+#include <raft/core/handle.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/core/mdarray.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.cuh>
@@ -37,26 +39,16 @@
 #endif
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <optional>
-#include <raft/core/mdarray.hpp>
-
-namespace raft::spatial::knn::detail {
 
-constexpr int kThreadsPerBlock = 128;
-
-template <typename T, typename... Extents>
-static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
-{
-  using extent_t  = extents<((void)exts, dynamic_extent)...>;
-  using mdarray_t = device_mdarray<T, extent_t, layout_c_contiguous>;
+namespace raft::spatial::knn::detail::ivf_flat {
 
-  typename mdarray_t::extents_type extent{exts...};
-  typename mdarray_t::mapping_type layout{extent};
-  typename mdarray_t::container_policy_type policy{stream};
+using raft::spatial::knn::ivf_flat::index;
+using raft::spatial::knn::ivf_flat::search_params;
 
-  return mdarray_t{layout, policy};
-}
+constexpr int kThreadsPerBlock = 128;
 
 /**
  * @brief Copy Veclen elements of type T from `query` to `query_shared` at position `loadDim *
@@ -920,7 +912,7 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSi
 
 template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
 void launch_kernel(Lambda lambda,
-                   const ivf_flat_index<T>& index,
+                   const ivf_flat::index<T>& index,
                    const T* queries,
                    const uint32_t* coarse_index,
                    const uint32_t num_queries,
@@ -1118,7 +1110,7 @@ struct select_interleaved_scan_kernel {
  * @param stream
  */
 template <typename T, typename AccT>
-void ivfflat_interleaved_scan(const ivf_flat_index<T>& index,
+void ivfflat_interleaved_scan(const ivf_flat::index<T>& index,
                               const T* queries,
                               const uint32_t* coarse_query_results,
                               const uint32_t n_queries,
@@ -1148,4 +1140,228 @@ void ivfflat_interleaved_scan(const ivf_flat_index<T>& index,
                                                stream);
 }
 
-}  // namespace raft::spatial::knn::detail
+template <typename T, typename AccT>
+void search_impl(const handle_t& handle,
+                 const index<T>& index,
+                 const T* queries,
+                 uint32_t n_queries,
+                 uint32_t k,
+                 uint32_t n_probes,
+                 bool select_min,
+                 size_t* neighbors,
+                 AccT* distances,
+                 rmm::cuda_stream_view stream,
+                 rmm::mr::device_memory_resource* search_mr)
+{
+  // The norm of query
+  rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
+  // The distance value of cluster(list) and queries
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr);
+  // The topk distance value of cluster(list) and queries
+  rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream, search_mr);
+  // The topk  index of cluster(list) and queries
+  rmm::device_uvector<uint32_t> coarse_indices_dev(n_queries * n_probes, stream, search_mr);
+  // The topk distance value of candicate vectors from each cluster(list)
+  rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream, search_mr);
+  // The topk index of candicate vectors from each cluster(list)
+  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream, search_mr);
+
+  size_t float_query_size;
+  if constexpr (std::is_integral_v<T>) {
+    float_query_size = n_queries * index.dim();
+  } else {
+    float_query_size = 0;
+  }
+  rmm::device_uvector<float> converted_queries_dev(float_query_size, stream, search_mr);
+  float* converted_queries_ptr = converted_queries_dev.data();
+
+  if constexpr (std::is_same_v<T, float>) {
+    converted_queries_ptr = const_cast<float*>(queries);
+  } else {
+    linalg::unaryOp(
+      converted_queries_ptr, queries, n_queries * index.dim(), utils::mapping<float>{}, stream);
+  }
+
+  float alpha = 1.0f;
+  float beta  = 0.0f;
+
+  if (index.metric == raft::distance::DistanceType::L2Expanded) {
+    alpha = -2.0f;
+    beta  = 1.0f;
+    utils::dots_along_rows(
+      n_queries, index.dim(), converted_queries_ptr, query_norm_dev.data(), stream);
+    utils::outer_add(query_norm_dev.data(),
+                     n_queries,
+                     index.center_norms->data(),
+                     index.n_lists(),
+                     distance_buffer_dev.data(),
+                     stream);
+    RAFT_LOG_TRACE_VEC(index.center_norms->data(), 20);
+    RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
+  } else {
+    alpha = 1.0f;
+    beta  = 0.0f;
+  }
+
+  linalg::gemm(handle,
+               true,
+               false,
+               index.n_lists(),
+               n_queries,
+               index.dim(),
+               &alpha,
+               index.centers.data(),
+               index.dim(),
+               converted_queries_ptr,
+               index.dim(),
+               &beta,
+               distance_buffer_dev.data(),
+               index.n_lists(),
+               stream);
+
+  RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
+  if (n_probes <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+    topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
+                                         nullptr,
+                                         n_queries,
+                                         index.n_lists(),
+                                         n_probes,
+                                         coarse_distances_dev.data(),
+                                         coarse_indices_dev.data(),
+                                         select_min,
+                                         stream);
+  } else {
+    topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
+                                              nullptr,
+                                              n_queries,
+                                              index.n_lists(),
+                                              n_probes,
+                                              coarse_distances_dev.data(),
+                                              coarse_indices_dev.data(),
+                                              select_min,
+                                              stream,
+                                              search_mr);
+  }
+  RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
+  RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes);
+
+  AccT* distances_dev_ptr = refined_distances_dev.data();
+  size_t* indices_dev_ptr = refined_indices_dev.data();
+
+  uint32_t grid_dim_x = 0;
+  if (n_probes > 1) {
+    // query the gridDimX size to store probes topK output
+    ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index,
+                                                                    nullptr,
+                                                                    nullptr,
+                                                                    n_queries,
+                                                                    index.metric,
+                                                                    n_probes,
+                                                                    k,
+                                                                    select_min,
+                                                                    nullptr,
+                                                                    nullptr,
+                                                                    grid_dim_x,
+                                                                    stream);
+  } else {
+    grid_dim_x = 1;
+  }
+
+  if (grid_dim_x == 1) {
+    distances_dev_ptr = distances;
+    indices_dev_ptr   = neighbors;
+  }
+
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index,
+                                                                  queries,
+                                                                  coarse_indices_dev.data(),
+                                                                  n_queries,
+                                                                  index.metric,
+                                                                  n_probes,
+                                                                  k,
+                                                                  select_min,
+                                                                  indices_dev_ptr,
+                                                                  distances_dev_ptr,
+                                                                  grid_dim_x,
+                                                                  stream);
+
+  RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
+  RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
+
+  // Merge topk values from different blocks
+  if (grid_dim_x > 1) {
+    if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
+      topk::warp_sort_topk<AccT, size_t>(refined_distances_dev.data(),
+                                         refined_indices_dev.data(),
+                                         n_queries,
+                                         k * grid_dim_x,
+                                         k,
+                                         distances,
+                                         neighbors,
+                                         select_min,
+                                         stream);
+    } else {
+      topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
+                                              refined_indices_dev.data(),
+                                              n_queries,
+                                              k * grid_dim_x,
+                                              k,
+                                              distances,
+                                              neighbors,
+                                              select_min,
+                                              stream,
+                                              search_mr);
+    }
+  }
+}
+
+template <typename T>
+inline void search(const handle_t& handle,
+                   const search_params& params,
+                   const index<T>& index,
+                   const T* queries,
+                   uint32_t n_queries,
+                   uint32_t k,
+                   size_t* neighbors,
+                   float* distances,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr = nullptr)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_flat_handle::search(%u, %u, %zu)", n_queries, k, neighbors);
+
+  RAFT_EXPECTS(params.n_probes > 0,
+               "n_probes (number of clusters to probe in the search) must be positive.");
+  auto n_probes = std::min<uint32_t>(params.n_probes, index.n_lists());
+
+  bool select_min;
+  switch (index.metric) {
+    case raft::distance::DistanceType::InnerProduct:
+    case raft::distance::DistanceType::CosineExpanded:
+    case raft::distance::DistanceType::CorrelationExpanded:
+      // Similarity metrics have the opposite meaning, i.e. nearest neigbours are those with larger
+      // similarity (See the same logic at cpp/include/raft/sparse/selection/detail/knn.cuh:362
+      // {perform_k_selection})
+      select_min = false;
+      break;
+    default: select_min = true;
+  }
+
+  //   // Set memory buffer to be reused across searches
+  //   auto cur_memory_resource = rmm::mr::get_current_device_resource();
+  //   if (!search_mem_res_.has_value() || search_mem_res_->get_upstream() != cur_memory_resource) {
+  //     search_mem_res_.emplace(cur_memory_resource, Pow2<256>::roundUp(n_queries * n_probes * k *
+  //     16));
+  //   }
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
+  if (mr == nullptr) {
+    pool_res.emplace(rmm::mr::get_current_device_resource(),
+                     Pow2<256>::roundUp(n_queries * n_probes * k * 16));
+    mr = &(pool_res.value());
+  }
+
+  return search_impl<T, float>(
+    handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, stream, mr);
+}
+
+}  // namespace raft::spatial::knn::detail::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
new file mode 100644
index 0000000000..7ad17038ea
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/ivf_flat_build.cuh"
+#include "detail/ivf_flat_search.cuh"
+#include "ivf_flat_types.hpp"
+
+#include <raft/core/handle.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+namespace raft::spatial::knn::ivf_flat {
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * @param handle
+ * @param params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ * @param n_rows the number of samples
+ * @param dim the dimensionality of the data
+ * @param stream
+ */
+template <typename T>
+inline auto build(const handle_t& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  uint32_t n_rows,
+                  uint32_t dim,
+                  rmm::cuda_stream_view stream) -> index<T>
+{
+  return raft::spatial::knn::detail::ivf_flat::build(handle, params, dataset, n_rows, dim, stream);
+}
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * @param handle
+ * @param params configure the search
+ * @param index ivf-flat constructed index
+ * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
+ * @param n_queries the batch size
+ * @param k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ * @param stream
+ * @param mr an optional memory resource to use across the searches (you can provide a large enough
+ *           memory pool here to avoid memory allocations within search).
+ */
+template <typename T>
+inline void search(const handle_t& handle,
+                   const search_params& params,
+                   const index<T>& index,
+                   const T* queries,
+                   uint32_t n_queries,
+                   uint32_t k,
+                   size_t* neighbors,
+                   float* distances,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr = nullptr)
+{
+  return raft::spatial::knn::detail::ivf_flat::search(
+    handle, params, index, queries, n_queries, k, neighbors, distances, stream, mr);
+}
+
+}  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
new file mode 100644
index 0000000000..9e06607526
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_common.hpp"
+
+#include <raft/core/mdarray.hpp>
+#include <raft/distance/distance_type.hpp>
+
+#include <optional>
+
+namespace raft::spatial::knn::ivf_flat {
+
+template <typename T>
+struct index {
+  using row_major = layout_c_contiguous;
+  using extent_1d = extents<dynamic_extent>;
+  using extent_2d = extents<dynamic_extent, dynamic_extent>;
+
+  /**
+   * Vectorized load/store size in elements, determines the size of interleaved data chunks.
+   *
+   * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
+   * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
+   */
+  uint32_t veclen;
+  /** Distance metric used for clustering. */
+  raft::distance::DistanceType metric;
+
+  /**
+   * Inverted list data [size, dim].
+   *
+   * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
+   * Within each list (cluster), the data is grouped into blocks of `WarpSize` interleaved
+   * vectors. Note, the total index length is slightly larger than the source dataset length,
+   * because each cluster is padded by `WarpSize` elements.
+   *
+   * Interleaving pattern:
+   * within groups of `WarpSize` rows, the data is interleaved with the block size equal to
+   * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
+   * followed by a chunk of the same size of the next row, and so on.
+   *
+   * __Example__: veclen = 2, dim = 6, WarpSize = 32, list_size = 31
+   * `
+   *   x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
+   *   x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
+   *   x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
+   *   x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
+   *   x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
+   *   x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
+   * `
+   */
+  device_mdarray<T, extent_2d, row_major> data;
+  /** Inverted list indices: ids of items in the source data [size] */
+  device_mdarray<uint32_t, extent_1d, row_major> indices;
+  /** Sizes of the lists (clusters) [n_lists] */
+  device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
+  /**
+   * Offsets into the lists [n_lists + 1].
+   * The last value contains the total length of the index.
+   */
+  device_mdarray<uint32_t, extent_1d, row_major> list_offsets;
+  /** k-means cluster centers corresponding to the lists [n_lists, dim] */
+  device_mdarray<float, extent_2d, row_major> centers;
+  /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists]  */
+  std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
+
+  /** Total length of the index. */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
+  /** Dimensionality of the data. */
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
+  /** Number of clusters/inverted lists. */
+  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> size_t
+  {
+    return centers.extent(0);
+  }
+
+  /** Throw an error if the index content is inconsistent. */
+  inline void check_consistency() const
+  {
+    RAFT_EXPECTS(dim() % veclen == 0, "dimensionality is not a multiple of the veclen");
+    RAFT_EXPECTS(data.extent(0) == indices.extent(0), "inconsistent index size");
+    RAFT_EXPECTS(data.extent(1) == centers.extent(1), "inconsistent data dimensionality");
+    RAFT_EXPECTS(                                             //
+      (centers.extent(0) == list_sizes.extent(0)) &&          //
+        (centers.extent(0) + 1 == list_offsets.extent(0)) &&  //
+        (!center_norms.has_value() || centers.extent(0) == center_norms->extent(0)),
+      "inconsistent number of lists (clusters)");
+    RAFT_EXPECTS(reinterpret_cast<size_t>(data.data()) % (veclen * sizeof(T)) == 0,
+                 "The data storage pointer is not aligned to the vector length");
+  }
+};
+
+struct index_params : ivf_index_params {
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction = 0.5;
+};
+
+struct search_params : ivf_search_params {
+};
+
+}  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 0b8185ffb2..a86848fdf9 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -145,8 +145,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     {
       rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
       rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
-      raft::spatial::knn::ivf_flat_index_params index_params;
-      raft::spatial::knn::ivf_flat_search_params search_params;
+      raft::spatial::knn::ivf_flat::index_params index_params;
+      raft::spatial::knn::ivf_flat::search_params search_params;
       index_params.n_lists   = ps.nlist;
       index_params.metric    = ps.metric;
       search_params.n_probes = ps.nprobe;

From 059a6c097a2df0e13d27e2cea8f5e67c44e0a8a7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 22 Jun 2022 11:42:13 +0200
Subject: [PATCH 077/118] Add the memory resource argument to warp_sort_topk

---
 .../spatial/knn/detail/ivf_flat_search.cuh    |  6 +-
 .../spatial/knn/detail/topk/radix_topk.cuh    | 12 ++--
 .../spatial/knn/detail/topk/warpsort_topk.cuh | 64 +++++++++++++++----
 3 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 778a2f8a27..1f121a4a6c 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -1229,7 +1229,8 @@ void search_impl(const handle_t& handle,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
                                          select_min,
-                                         stream);
+                                         stream,
+                                         search_mr);
   } else {
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                               nullptr,
@@ -1299,7 +1300,8 @@ void search_impl(const handle_t& handle,
                                          distances,
                                          neighbors,
                                          select_min,
-                                         stream);
+                                         stream,
+                                         search_mr);
     } else {
       topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
                                               refined_indices_dev.data(),
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index c0b86c9970..a012bd7f7d 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -528,12 +528,12 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len)
  * @param[in] in_idx
  *   contiguous device array of inputs of size (len * batch_size);
  *   typically, these are indices of the corresponding in_keys.
- * @param[in] batch_size
+ * @param batch_size
  *   number of input rows, i.e. the batch size.
- * @param[in] len
+ * @param len
  *   length of a single input array (row); also sometimes referred as n_cols.
  *   Invariant: len >= k.
- * @param[in] k
+ * @param k
  *   the number of outputs to select in each input row.
  * @param[out] out
  *   contiguous device array of outputs of size (k * batch_size);
@@ -541,9 +541,11 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len)
  * @param[out] out_idx
  *   contiguous device array of outputs of size (k * batch_size);
  *   the payload selected together with `out`.
- * @param[in] select_min
+ * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
- * @param[in] stream
+ * @param stream
+ * @param mr an optional memory resource to use across the calls (you can provide a large enough
+ *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
 void radix_topk(const T* in,
diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
index a599e8367e..cd5d2fc728 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
@@ -25,6 +25,10 @@
 #include <functional>
 #include <type_traits>
 
+#include <rmm/device_vector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
 /*
   Three APIs of different scopes are provided:
     1. host function: warp_sort_topk()
@@ -571,7 +575,7 @@ struct launch_setup {
                      const IdxT* in_idx,
                      T* out_key,
                      IdxT* out_idx,
-                     cudaStream_t stream)
+                     rmm::cuda_stream_view stream)
   {
     const int capacity = calc_capacity(k);
     if constexpr (Capacity > 1) {
@@ -719,10 +723,19 @@ void warp_sort_topk_(int num_of_block,
                      T* out,
                      IdxT* out_idx,
                      bool select_min,
-                     cudaStream_t stream = 0)
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr = nullptr)
 {
-  rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream);
-  rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream);
+  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
+  if (mr == nullptr) {
+    pool_res.emplace(
+      rmm::mr::get_current_device_resource(),
+      Pow2<256>::roundUp(num_of_block * k * batch_size * 2 * std::max(sizeof(T), sizeof(IdxT))));
+    mr = &(pool_res.value());
+  }
+
+  rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);
+  rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream, mr);
 
   int capacity   = calc_capacity(k);
   int warp_width = std::min(capacity, WarpSize);
@@ -780,12 +793,12 @@ void warp_sort_topk_(int num_of_block,
  * @param[in] in_idx
  *   contiguous device array of inputs of size (len * batch_size);
  *   typically, these are indices of the corresponding in_keys.
- * @param[in] batch_size
+ * @param batch_size
  *   number of input rows, i.e. the batch size.
- * @param[in] len
+ * @param len
  *   length of a single input array (row); also sometimes referred as n_cols.
  *   Invariant: len >= k.
- * @param[in] k
+ * @param k
  *   the number of outputs to select in each input row.
  * @param[out] out
  *   contiguous device array of outputs of size (k * batch_size);
@@ -793,9 +806,11 @@ void warp_sort_topk_(int num_of_block,
  * @param[out] out_idx
  *   contiguous device array of outputs of size (k * batch_size);
  *   the payload selected together with `out`.
- * @param[in] select_min
+ * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
- * @param[in] stream
+ * @param stream
+ * @param mr an optional memory resource to use across the calls (you can provide a large enough
+ *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT>
 void warp_sort_topk(const T* in,
@@ -806,7 +821,8 @@ void warp_sort_topk(const T* in,
                     T* out,
                     IdxT* out_idx,
                     bool select_min,
-                    rmm::cuda_stream_view stream = 0)
+                    rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource* mr = nullptr)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
@@ -821,13 +837,33 @@ void warp_sort_topk(const T* in,
   int len_per_thread = len / (num_of_block * num_of_warp * std::min(capacity, WarpSize));
 
   if (len_per_thread <= LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing) {
-    warp_sort_topk_<warp_sort_immediate, T, IdxT>(
-      num_of_block, num_of_warp, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    warp_sort_topk_<warp_sort_immediate, T, IdxT>(num_of_block,
+                                                  num_of_warp,
+                                                  in,
+                                                  in_idx,
+                                                  batch_size,
+                                                  len,
+                                                  k,
+                                                  out,
+                                                  out_idx,
+                                                  select_min,
+                                                  stream,
+                                                  mr);
   } else {
     calc_launch_parameter<warp_sort_filtered, T, IdxT>(
       batch_size, len, k, &num_of_block, &num_of_warp);
-    warp_sort_topk_<warp_sort_filtered, T, IdxT>(
-      num_of_block, num_of_warp, in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    warp_sort_topk_<warp_sort_filtered, T, IdxT>(num_of_block,
+                                                 num_of_warp,
+                                                 in,
+                                                 in_idx,
+                                                 batch_size,
+                                                 len,
+                                                 k,
+                                                 out,
+                                                 out_idx,
+                                                 select_min,
+                                                 stream,
+                                                 mr);
   }
 }
 

From df17b5b631ddec4e277c2af4386b9aa9f073bdd8 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 22 Jun 2022 11:45:00 +0200
Subject: [PATCH 078/118] update docs

---
 cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh  | 1 +
 cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh | 1 +
 cpp/include/raft/spatial/knn/ivf_flat.cuh               | 6 ++++++
 3 files changed, 8 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 0d5456dbf7..49391221be 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -113,6 +113,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
   }
 }
 
+/** See raft::spatial::knn::ivf_flat::build docs */
 template <typename T>
 inline auto build(const handle_t& handle,
                   const index_params& params,
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 1f121a4a6c..8ee05a23d3 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -1317,6 +1317,7 @@ void search_impl(const handle_t& handle,
   }
 }
 
+/** See raft::spatial::knn::ivf_flat::search docs */
 template <typename T>
 inline void search(const handle_t& handle,
                    const search_params& params,
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 7ad17038ea..18085666f0 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -30,12 +30,16 @@ namespace raft::spatial::knn::ivf_flat {
 /**
  * @brief Build the index from the dataset for efficient search.
  *
+ * @tparam T data element type
+ *
  * @param handle
  * @param params configure the index building
  * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
  * @param n_rows the number of samples
  * @param dim the dimensionality of the data
  * @param stream
+ *
+ * @return the constructed ivf-flat index
  */
 template <typename T>
 inline auto build(const handle_t& handle,
@@ -51,6 +55,8 @@ inline auto build(const handle_t& handle,
 /**
  * @brief Search ANN using the constructed index.
  *
+ * @tparam T data element type
+ *
  * @param handle
  * @param params configure the search
  * @param index ivf-flat constructed index

From fe9ced11560392c9ede579bd0c83dd1437f35965 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:48:51 +0200
Subject: [PATCH 079/118] Allow empty mesoclusters

---
 .../raft/spatial/knn/detail/ann_kmeans_balanced.cuh      | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index e039ac9f32..9280faf0f2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -533,7 +533,7 @@ auto build_fine_clusters(const handle_t& handle,
   rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
     fine_clusters_nums_max, stream, managed_memory);
 
-  // Training clusters in each meso-clusters
+  // Training clusters in each meso-cluster
   uint32_t n_clusters_done = 0;
   for (uint32_t i = 0; i < n_mesoclusters; i++) {
     uint32_t k = 0;
@@ -541,6 +541,13 @@ auto build_fine_clusters(const handle_t& handle,
       if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; }
     }
     RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i);
+    if (k == 0) {
+      RAFT_LOG_DEBUG("Empty cluster %d", i);
+      RAFT_EXPECTS(fine_clusters_nums[i] == 0,
+                   "Number of fine clusters must be zero for the empty mesocluster (got %d)",
+                   fine_clusters_nums[i]);
+      continue;
+    }
 
     utils::copy_selected<T>(
       mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream);

From 91fdcbb3dc83260640bd4317150fd18c55b21684 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:53:26 +0200
Subject: [PATCH 080/118] Add low-dimensional and
 non-veclen-aligned-dimensional test cases

---
 cpp/test/spatial/ann_ivf_flat.cu | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index a86848fdf9..401907af8a 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -216,6 +216,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
 };
 
 const std::vector<AnnIvfFlatInputs> inputs = {
+
+  {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+
   {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded},

From be14c63398eaa4ae3917b1ee67b6fd8075722ec2 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:54:17 +0200
Subject: [PATCH 081/118] Refactor and document loadAndComputeDist

---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 397 +++++++-----------
 1 file changed, 158 insertions(+), 239 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 8ee05a23d3..9a270a968a 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -117,12 +117,21 @@ __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const
   sts(reinterpret_cast<int32_t*>(query_shared) + loadIndex, queryReg);
 }
 
-template <int kUnroll,
-          int wordsPerVectorBlockDim,
-          typename Lambda,
-          int Veclen,
-          typename T,
-          typename AccT>
+/**
+ * @brief Load a part of a vector from the index and from query, compute the (part of the) distance
+ * between them, and aggregate it using the provided Lambda; one structure per thread, per query,
+ * and per index item.
+ *
+ * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen)
+ * @tparam GroupSize number of vectors in the interleaved groups in the index.
+ * @tparam Lambda computing the part of the distance for one dimension and aggregating it:
+ *                void (AccT& acc, AccT x, AccT y)
+ * @tparam Veclen size of the vectorized load
+ * @tparam T type of the data in the query and the index
+ * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit
+ * values)
+ */
+template <int kUnroll, int GroupSize, typename Lambda, int Veclen, typename T, typename AccT>
 struct loadAndComputeDist {
   Lambda compute_dist;
   AccT& dist;
@@ -132,83 +141,85 @@ struct loadAndComputeDist {
   {
   }
 
+  /**
+   * Load parts of vectors from the index and query and accumulates the partial distance.
+   * This version assumes the query is stored in shared memory.
+   * Every thread here processes exactly kUnroll * Veclen elements independently of others.
+   */
   template <typename IdxT>
   __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
                                                       const T* query_shared,
                                                       IdxT loadIndex,
-                                                      IdxT baseShmemIndex,
-                                                      IdxT iShmemIndex)
+                                                      IdxT shmemIndex)
   {
-    T encV[kUnroll][Veclen];
-    T queryRegs[kUnroll][Veclen];
-    constexpr int stride  = kUnroll * Veclen;
-    const int shmemStride = baseShmemIndex + iShmemIndex * stride;
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      ldg(encV[j], data + (loadIndex + j * wordsPerVectorBlockDim) * Veclen);
-      const int d = shmemStride + j * Veclen;
-      lds(queryRegs[j], &query_shared[d]);
+      T encV[Veclen];
+      ldg(encV, data + (loadIndex + j * GroupSize) * Veclen);
+      T queryRegs[Veclen];
+      lds(queryRegs, &query_shared[shmemIndex + j * Veclen]);
 #pragma unroll
       for (int k = 0; k < Veclen; ++k) {
-        compute_dist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[k], encV[k]);
       }
     }
   }
 
+  /**
+   * Load parts of vectors from the index and query and accumulates the partial distance.
+   * This version assumes the query is stored in the global memory and is different for every
+   * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into
+   * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once).
+   */
   template <typename IdxT>
   __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
                                                         const T* query,
                                                         IdxT baseLoadIndex,
                                                         const int laneId)
   {
-    T encV[kUnroll][Veclen];
     T queryReg               = query[baseLoadIndex + laneId];
     constexpr int stride     = kUnroll * Veclen;
     constexpr int totalIter  = WarpSize / stride;
-    constexpr int gmemStride = stride * wordsPerVectorBlockDim;
+    constexpr int gmemStride = stride * GroupSize;
 #pragma unroll
     for (int i = 0; i < totalIter; ++i, data += gmemStride) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        ldg(encV[j], (data + (laneId + j * wordsPerVectorBlockDim) * Veclen));
-        T q[Veclen];
+        T encV[Veclen];
+        ldg(encV, data + (laneId + j * GroupSize) * Veclen);
         const int d = (i * kUnroll + j) * Veclen;
 #pragma unroll
         for (int k = 0; k < Veclen; ++k) {
-          q[k] = shfl(queryReg, d + k, WarpSize);
-          compute_dist(dist, q[k], encV[j][k]);  //@TODO add other metrics
+          compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]);
         }
       }
     }
   }
 
+  /**
+   * Load parts of vectors from the index and query and accumulates the partial distance.
+   * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`.
+   */
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(
     const T*& data, const T* query, const int laneId, const int dim, const int dimBlocks)
   {
     const int loadDim     = dimBlocks + laneId;
     T queryReg            = loadDim < dim ? query[loadDim] : 0;
     const int loadDataIdx = laneId * Veclen;
-    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += GroupSize * Veclen) {
       T enc[Veclen];
-      T q[Veclen];
       ldg(enc, data + loadDataIdx);
 #pragma unroll
       for (int k = 0; k < Veclen; k++) {
-        q[k] = shfl(queryReg, d + k, WarpSize);
-        compute_dist(dist, q[k], enc[k]);
+        compute_dist(dist, shfl(queryReg, d + k, WarpSize), enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
 };
 
 // This handles uint8_t 8, 16 Veclens
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda, int uint8_veclen>
-struct loadAndComputeDist<kUnroll,
-                          wordsPerVectorBlockDim,
-                          Lambda,
-                          uint8_veclen,
-                          uint8_t,
-                          uint32_t> {
+template <int kUnroll, int GroupSize, typename Lambda, int uint8_veclen>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -220,24 +231,19 @@ struct loadAndComputeDist<kUnroll,
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
                                                       const uint8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
     constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    uint32_t encV[kUnroll][veclen_int];
-    uint32_t queryRegs[kUnroll][veclen_int];
-
-    loadIndex = loadIndex * veclen_int;
+    loadIndex                = loadIndex * veclen_int;
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      ldg(encV[j],
-          reinterpret_cast<unsigned const*>(data) + loadIndex +
-            j * wordsPerVectorBlockDim * veclen_int);
-      const int d = iShmemIndex * kUnroll + j * veclen_int;
-      lds(queryRegs[j], reinterpret_cast<unsigned const*>(query_shared + baseShmemIndex) + d);
+      uint32_t encV[veclen_int];
+      ldg(encV, reinterpret_cast<unsigned const*>(data) + loadIndex + j * GroupSize * veclen_int);
+      uint32_t queryRegs[veclen_int];
+      lds(queryRegs, reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[k], encV[k]);
       }
     }
   }
@@ -247,24 +253,20 @@ struct loadAndComputeDist<kUnroll,
                                                         const int laneId)
   {
     constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    uint32_t encV[kUnroll][veclen_int];
     uint32_t queryReg =
       (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
-    uint32_t q[kUnroll][veclen_int];
     constexpr int stride = kUnroll * uint8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        ldg(encV[j],
-            reinterpret_cast<unsigned const*>(data) +
-              (laneId + j * wordsPerVectorBlockDim) * veclen_int);
+        uint32_t encV[veclen_int];
+        ldg(encV, reinterpret_cast<unsigned const*>(data) + (laneId + j * GroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = shfl(queryReg, d + k, WarpSize);
-          compute_dist(dist, q[j][k], encV[j][k]);
+          compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]);
         }
       }
     }
@@ -279,15 +281,13 @@ struct loadAndComputeDist<kUnroll,
     constexpr int veclen_int = uint8_veclen / 4;
     const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks;
-         d += uint8_veclen, data += wordsPerVectorBlockDim * uint8_veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += uint8_veclen, data += GroupSize * uint8_veclen) {
       uint32_t enc[veclen_int];
-      uint32_t q[veclen_int];
       ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = shfl(queryReg, (d / 4) + k, WarpSize);
-        compute_dist(dist, q[k], enc[k]);
+        uint32_t q = shfl(queryReg, (d / 4) + k, WarpSize);
+        compute_dist(dist, q, enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
@@ -295,8 +295,8 @@ struct loadAndComputeDist<kUnroll,
 
 // Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
 // using above common template of int2/int4
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, uint32_t> {
+template <int kUnroll, int GroupSize, typename Lambda>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -308,18 +308,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
                                                       const uint8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
-    uint32_t encV[kUnroll];
-    uint32_t queryRegs[kUnroll];
-
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      encV[j]     = reinterpret_cast<unsigned const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
-      const int d = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = reinterpret_cast<unsigned const*>(query_shared + baseShmemIndex)[d];
-      compute_dist(dist, queryRegs[j], encV[j]);
+      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * GroupSize];
+      uint32_t queryRegs = reinterpret_cast<unsigned const*>(query_shared + shmemIndex)[j];
+      compute_dist(dist, queryRegs, encV);
     }
   }
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
@@ -327,21 +322,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
                                                         int baseLoadIndex,
                                                         const int laneId)
   {
-    uint32_t encV[kUnroll];
     uint32_t queryReg =
       (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
-    uint32_t q[kUnroll];
     constexpr int veclen = 4;
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        encV[j]     = reinterpret_cast<unsigned const*>(data)[laneId + j * wordsPerVectorBlockDim];
-        const int d = (i * kUnroll + j);
-        q[j]        = shfl(queryReg, d, WarpSize);
-        compute_dist(dist, q[j], encV[j]);
+        uint32_t encV = reinterpret_cast<unsigned const*>(data)[laneId + j * GroupSize];
+        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
+        compute_dist(dist, q, encV);
       }
     }
   }
@@ -355,7 +347,7 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
     constexpr int veclen = 4;
     const int loadDim    = dimBlocks + laneId;
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
@@ -363,8 +355,8 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 4, uint8_t, u
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, uint32_t> {
+template <int kUnroll, int GroupSize, typename Lambda>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -376,19 +368,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
                                                       const uint8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
-    uint32_t encV[kUnroll];
-    uint32_t queryRegs[kUnroll];
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      encV[j]     = 0;
-      encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
-      const int d = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = 0;
-      queryRegs[j] = reinterpret_cast<uint16_t const*>(query_shared + baseShmemIndex)[d];
-      compute_dist(dist, queryRegs[j], encV[j]);
+      uint32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * GroupSize];
+      uint32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
+      compute_dist(dist, queryRegs, encV);
     }
   }
 
@@ -397,22 +383,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
                                                         int baseLoadIndex,
                                                         const int laneId)
   {
-    uint32_t encV[kUnroll];
-    uint32_t queryReg = 0;
-    queryReg = (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
-    uint32_t q[kUnroll];
+    uint32_t queryReg =
+      (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
     constexpr int veclen = 2;
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        encV[j]     = 0;
-        encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
-        const int d = (i * kUnroll + j);
-        q[j]        = shfl(queryReg, d, WarpSize);
-        compute_dist(dist, q[j], encV[j]);
+        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * GroupSize];
+        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
+        compute_dist(dist, q, encV);
       }
     }
   }
@@ -425,18 +407,17 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, uint8_t, u
   {
     constexpr int veclen = 2;
     int loadDim          = dimBlocks + laneId * veclen;
-    uint32_t queryReg    = 0;
-    queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, uint32_t> {
+template <int kUnroll, int GroupSize, typename Lambda>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -448,17 +429,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
   __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
                                                       const uint8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
-    uint32_t encV[kUnroll];
-    uint32_t queryRegs[kUnroll];
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
-      const int d  = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = query_shared[baseShmemIndex + d];
-      compute_dist(dist, queryRegs[j], encV[j]);
+      uint32_t encV      = data[loadIndex + j * GroupSize];
+      uint32_t queryRegs = query_shared[shmemIndex + j];
+      compute_dist(dist, queryRegs, encV);
     }
   }
 
@@ -467,22 +444,17 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
                                                         int baseLoadIndex,
                                                         const int laneId)
   {
-    uint32_t encV[kUnroll];
-    uint32_t queryReg = 0;
-    queryReg          = query[baseLoadIndex + laneId];
-    uint32_t q[kUnroll];
+    uint32_t queryReg    = query[baseLoadIndex + laneId];
     constexpr int veclen = 1;
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        encV[j]     = 0;
-        encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
-        const int d = (i * kUnroll + j);
-        q[j]        = shfl(queryReg, d, WarpSize);
-        compute_dist(dist, q[j], encV[j]);
+        uint32_t encV = data[laneId + j * GroupSize];
+        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
+        compute_dist(dist, q, encV);
       }
     }
   }
@@ -495,20 +467,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, uint8_t, u
   {
     constexpr int veclen = 1;
     int loadDim          = dimBlocks + laneId;
-    uint32_t queryReg    = 0;
-    queryReg             = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
-      uint32_t enc = 0;
-      enc          = data[laneId];
+    uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+      uint32_t enc = data[laneId];
       uint32_t q   = shfl(queryReg, d, WarpSize);
       compute_dist(dist, q, enc);
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
 // This device function is for int8 veclens 4, 8 and 16
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda, int int8_veclen>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen, int8_t, int32_t> {
+template <int kUnroll, int GroupSize, typename Lambda, int int8_veclen>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
 
@@ -520,23 +490,19 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
                                                       const int8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
     constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-    int32_t encV[kUnroll][veclen_int];
-    int32_t queryRegs[kUnroll][veclen_int];
 
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      ldg(encV[j],
-          reinterpret_cast<int32_t const*>(data) +
-            (loadIndex + j * wordsPerVectorBlockDim) * veclen_int);
-      const int d = iShmemIndex * kUnroll + j * veclen_int;
-      lds(queryRegs[j], reinterpret_cast<int32_t const*>(query_shared + baseShmemIndex) + d);
+      int32_t encV[veclen_int];
+      ldg(encV, reinterpret_cast<int32_t const*>(data) + (loadIndex + j * GroupSize) * veclen_int);
+      int32_t queryRegs[veclen_int];
+      lds(queryRegs, reinterpret_cast<int32_t const*>(query_shared + shmemIndex) + j * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[j][k], encV[j][k]);
+        compute_dist(dist, queryRegs[k], encV[k]);
       }
     }
   }
@@ -547,24 +513,22 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
                                                         const int laneId)
   {
     constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-    int32_t encV[kUnroll][veclen_int];
+
     int32_t queryReg =
       (laneId < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[laneId] : 0;
-    int32_t q[kUnroll][veclen_int];
     constexpr int stride = kUnroll * int8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        ldg(encV[j],
-            reinterpret_cast<int32_t const*>(data) +
-              (laneId + j * wordsPerVectorBlockDim) * veclen_int);
+        int32_t encV[veclen_int];
+        ldg(encV, reinterpret_cast<int32_t const*>(data) + (laneId + j * GroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
-          q[j][k] = shfl(queryReg, d + k, WarpSize);
-          compute_dist(dist, q[j][k], encV[j][k]);
+          int32_t q = shfl(queryReg, d + k, WarpSize);
+          compute_dist(dist, q, encV[k]);
         }
       }
     }
@@ -576,22 +540,20 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, int8_veclen,
     constexpr int veclen_int = int8_veclen / 4;
     const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int;
     int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks;
-         d += int8_veclen, data += wordsPerVectorBlockDim * int8_veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += GroupSize * int8_veclen) {
       int32_t enc[veclen_int];
-      int32_t q[veclen_int];
       ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
-        q[k] = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
-        compute_dist(dist, q[k], enc[k]);
+        int32_t q = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
+        compute_dist(dist, q, enc[k]);
       }
     }  // end for d < dim - dimBlocks
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, int32_t> {
+template <int kUnroll, int GroupSize, typename Lambda>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
@@ -601,19 +563,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
                                                       const int8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
-    int32_t encV[kUnroll];
-    int32_t queryRegs[kUnroll];
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      encV[j]     = 0;
-      encV[j]     = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * wordsPerVectorBlockDim];
-      const int d = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = 0;
-      queryRegs[j] = reinterpret_cast<uint16_t const*>(query_shared + baseShmemIndex)[d];
-      compute_dist(dist, queryRegs[j], encV[j]);
+      int32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * GroupSize];
+      int32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
+      compute_dist(dist, queryRegs, encV);
     }
   }
 
@@ -622,22 +578,18 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
                                                         int baseLoadIndex,
                                                         const int laneId)
   {
-    int32_t encV[kUnroll];
-    int32_t queryReg = 0;
-    queryReg = (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
-    int32_t q[kUnroll];
+    int32_t queryReg =
+      (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
     constexpr int veclen = 2;
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        encV[j]     = 0;
-        encV[j]     = reinterpret_cast<uint16_t const*>(data)[laneId + j * wordsPerVectorBlockDim];
-        const int d = (i * kUnroll + j);
-        q[j]        = shfl(queryReg, d, WarpSize);
-        compute_dist(dist, q[j], encV[j]);
+        int32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * GroupSize];
+        int32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
+        compute_dist(dist, q, encV);
       }
     }
   }
@@ -647,18 +599,17 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 2, int8_t, in
   {
     constexpr int veclen = 2;
     int loadDim          = dimBlocks + laneId * veclen;
-    int32_t queryReg     = 0;
-    queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
+    int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
       int32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
-template <int kUnroll, int wordsPerVectorBlockDim, typename Lambda>
-struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, int32_t> {
+template <int kUnroll, int GroupSize, typename Lambda>
+struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
@@ -669,20 +620,11 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
   __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
                                                       const int8_t* query_shared,
                                                       int loadIndex,
-                                                      int baseShmemIndex,
-                                                      int iShmemIndex)
+                                                      int shmemIndex)
   {
-    int32_t encV[kUnroll];
-    int32_t queryRegs[kUnroll];
-
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      encV[j]      = 0;
-      encV[j]      = data[loadIndex + j * wordsPerVectorBlockDim];
-      const int d  = (iShmemIndex * kUnroll + j);
-      queryRegs[j] = 0;
-      queryRegs[j] = query_shared[baseShmemIndex + d];
-      compute_dist(dist, queryRegs[j], encV[j]);
+      compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * GroupSize]);
     }
   }
 
@@ -693,20 +635,13 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
   {
     constexpr int veclen = 1;
     constexpr int stride = kUnroll * veclen;
-    int32_t encV[kUnroll];
-    int32_t queryReg = 0;
-    queryReg         = query[baseLoadIndex + laneId];
-    int32_t q[kUnroll];
+    int32_t queryReg     = query[baseLoadIndex + laneId];
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * wordsPerVectorBlockDim) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        encV[j]     = 0;
-        encV[j]     = data[laneId + j * wordsPerVectorBlockDim];
-        const int d = (i * kUnroll + j);
-        q[j]        = shfl(queryReg, d, WarpSize);
-        compute_dist(dist, q[j], encV[j]);
+        compute_dist(dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[laneId + j * GroupSize]);
       }
     }
   }
@@ -715,14 +650,10 @@ struct loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, Lambda, 1, int8_t, in
   {
     constexpr int veclen = 1;
     const int loadDim    = dimBlocks + laneId;
-    int32_t queryReg     = 0;
-    queryReg             = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += wordsPerVectorBlockDim * veclen) {
-      int32_t enc = 0;
-      enc         = data[laneId];
-      int32_t q   = shfl(queryReg, d, WarpSize);
-      compute_dist(dist, q, enc);
-    }  // end for d < dim - dimBlocks
+    int32_t queryReg     = loadDim < dim ? query[loadDim] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+      compute_dist(dist, shfl(queryReg, d, WarpSize), data[laneId]);
+    }
   }
 };
 
@@ -780,8 +711,8 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   int queryId      = blockIdx.y;
 
   /// Set the address
-  auto query                           = queries + queryId * dim;
-  constexpr int wordsPerVectorBlockDim = WarpSize;
+  auto query               = queries + queryId * dim;
+  constexpr int kGroupSize = WarpSize;
 
   // How many full warps needed to compute the distance (without remainder)
   const int full_warps_along_dim = align_warp::roundDown(dim);
@@ -813,6 +744,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
     // The number of interleaved group to be processed
     const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
 
+    constexpr int kUnroll        = WarpSize / Veclen;
     constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
     // Every warp reads WarpSize vectors and computes the distances to them.
     // Then, the distances and corresponding ids are distributed among the threads,
@@ -824,34 +756,21 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       bool valid         = vec < numVecs;
       size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)laneId;
       // This is where this warp begins reading data
-      const T* data =
-        vecsBase + size_t(block) * wordsPerVectorBlockDim * dim;  // Start position of this block
+      const T* data = vecsBase + size_t(block) * kGroupSize * dim;  // Start position of this block
 
       if (valid) {
         /// load query from shared mem
-        for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {  //
-          constexpr int kUnroll   = WarpSize / Veclen;
-          constexpr int stride    = kUnroll * Veclen;
-          constexpr int totalIter = WarpSize / stride;
-
-          loadAndComputeDist<kUnroll,
-                             wordsPerVectorBlockDim,
-                             decltype(compute_dist),
-                             Veclen,
-                             T,
-                             AccT>
-            obj(dist, compute_dist);
-#pragma unroll
-          for (int i = 0; i < totalIter; ++i, data += stride * wordsPerVectorBlockDim) {
-            obj.runLoadShmemCompute(data, query_shared, laneId, dBase, i);
-          }  // end for i < WarpSize / kUnroll
-        }    // end for dBase < full_warps_along_dim
+        for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {
+          loadAndComputeDist<kUnroll, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
+            dist, compute_dist);
+          obj.runLoadShmemCompute(data, query_shared, laneId, dBase);
+          data += WarpSize * kGroupSize;
+        }
       }
 
       if (dim > query_smem_elems) {
-        constexpr int kUnroll = WarpSize / Veclen;
-        loadAndComputeDist<kUnroll, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
-          obj(dist, compute_dist);
+        loadAndComputeDist<kUnroll, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
+          dist, compute_dist);
         for (int dBase = shLoadDim; dBase < full_warps_along_dim; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
         }
@@ -862,10 +781,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
         if (valid) {
           /// Remainder chunk = dim - full_warps_along_dim
           for (int d = 0; d < dim - full_warps_along_dim;
-               d += Veclen, data += wordsPerVectorBlockDim * Veclen) {
-            loadAndComputeDist<1, wordsPerVectorBlockDim, decltype(compute_dist), Veclen, T, AccT>
-              obj(dist, compute_dist);
-            obj.runLoadShmemCompute(data, query_shared, laneId, full_warps_along_dim + d, 0);
+               d += Veclen, data += kGroupSize * Veclen) {
+            loadAndComputeDist<1, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
+              dist, compute_dist);
+            obj.runLoadShmemCompute(data, query_shared, laneId, full_warps_along_dim + d);
           }  // end for d < dim - full_warps_along_dim
         }
       }

From eeb4601537284f515db14f7e400d3a2ab31bdd47 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:59:10 +0200
Subject: [PATCH 082/118] Minor renamings

---
 .../raft/spatial/knn/detail/ivf_flat_search.cuh      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 9a270a968a..5c195c53ca 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -739,21 +739,21 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
     // The start address of index of vector for each cluster(list) interleaved
     auto indexBase = list_index + list_prefix_interleave[listId];
     // The number of vectors in each cluster(list); [nlist]
-    const uint32_t numVecs = list_lengths[listId];
+    const uint32_t list_length = list_lengths[listId];
 
-    // The number of interleaved group to be processed
-    const uint32_t numBlocks = ceildiv<uint32_t>(numVecs, WarpSize);
+    // The number of interleaved groups to be processed
+    const uint32_t num_groups = ceildiv<uint32_t>(list_length, WarpSize);
 
     constexpr int kUnroll        = WarpSize / Veclen;
     constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
     // Every warp reads WarpSize vectors and computes the distances to them.
     // Then, the distances and corresponding ids are distributed among the threads,
     // and each thread adds one (id, dist) pair to the filtering queue.
-    for (uint32_t block = warpId; block < numBlocks; block += kNumWarps) {
+    for (uint32_t block = warpId; block < num_groups; block += kNumWarps) {
       AccT dist = 0;
       // This is the vector a given lane/thread handles
       const uint32_t vec = block * WarpSize + laneId;
-      bool valid         = vec < numVecs;
+      bool valid         = vec < list_length;
       size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)laneId;
       // This is where this warp begins reading data
       const T* data = vecsBase + size_t(block) * kGroupSize * dim;  // Start position of this block
@@ -793,7 +793,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
       float val              = (valid) ? (float)dist : kDummy;
       queue.add(val, idx);
-    }  // end for block < numBlocks
+    }
   }
 
   /// Warp_wise topk

From 025e5a571590b587c645216447c9012762c4d51b Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Jun 2022 16:04:54 +0200
Subject: [PATCH 083/118] Add 8bit int types to knn benchmarks

---
 cpp/bench/spatial/knn.cu | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 49d1b00dfb..a83f914f2f 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -131,6 +131,8 @@ struct host_uvector {
 
 template <typename ValT, typename IdxT>
 struct ivf_flat_knn {
+  using dist_t = float;
+
   raft::spatial::knn::knnIndex index;
   raft::spatial::knn::ivf_flat::index_params index_params;
   raft::spatial::knn::ivf_flat::search_params search_params;
@@ -150,7 +152,7 @@ struct ivf_flat_knn {
 
   void search(const raft::handle_t& handle,
               const ValT* search_items,
-              ValT* out_dists,
+              dist_t* out_dists,
               IdxT* out_idxs)
   {
     search_params.n_probes = 20;
@@ -167,6 +169,8 @@ struct ivf_flat_knn {
 
 template <typename ValT, typename IdxT>
 struct brute_force_knn {
+  using dist_t = ValT;
+
   ValT* index;
   params ps;
 
@@ -177,7 +181,7 @@ struct brute_force_knn {
 
   void search(const raft::handle_t& handle,
               const ValT* search_items,
-              ValT* out_dists,
+              dist_t* out_dists,
               IdxT* out_idxs)
   {
     std::vector<ValT*> input{index};
@@ -207,8 +211,7 @@ struct knn : public fixture {
       out_idxs_(p.n_probes * p.k, stream)
   {
     raft::random::RngState state{42};
-    raft::random::uniform(
-      state, search_items_.data(), search_items_.size(), ValT(-1.0), ValT(1.0), stream);
+    gen_data(state, search_items_, search_items_.size(), stream);
     try {
       size_t total_size = p.n_samples * p.n_dims;
       data_host_.resize(total_size);
@@ -216,7 +219,7 @@ struct knn : public fixture {
       rmm::device_uvector<ValT> d(std::min(kGenMinibatchSize, total_size), stream);
       for (size_t offset = 0; offset < total_size; offset += kGenMinibatchSize) {
         size_t actual_size = std::min(total_size - offset, kGenMinibatchSize);
-        raft::random::uniform(state, d.data(), actual_size, ValT(-1.0), ValT(1.0), stream);
+        gen_data(state, d, actual_size, stream);
         copy(data_host_.data() + offset, d.data(), actual_size, stream);
       }
     } catch (std::bad_alloc& e) {
@@ -224,6 +227,22 @@ struct knn : public fixture {
     }
   }
 
+  template <typename T>
+  void gen_data(raft::random::RngState& state,
+                rmm::device_uvector<T>& vec,
+                size_t n,
+                rmm::cuda_stream_view stream)
+  {
+    constexpr T kRangeMax = T(std::min<double>(
+      raft::spatial::knn::detail::utils::config<T>::kDivisor, std::numeric_limits<T>::max()));
+    constexpr T kRangeMin = std::is_signed_v<T> ? -kRangeMax : T(0);
+    if constexpr (std::is_integral_v<T>) {
+      raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    } else {
+      raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    }
+  }
+
   void run_benchmark(::benchmark::State& state) override
   {
     if (data_does_not_fit_) {
@@ -326,7 +345,7 @@ struct knn : public fixture {
 
   std::vector<ValT> data_host_;
   rmm::device_uvector<ValT> search_items_;
-  rmm::device_uvector<ValT> out_dists_;
+  rmm::device_uvector<typename ImplT::dist_t> out_dists_;
   rmm::device_uvector<IdxT> out_idxs_;
 };
 
@@ -352,5 +371,7 @@ const std::vector<Scope> kAllScopes{Scope::BUILD, Scope::SEARCH, Scope::BUILD_SE
 
 KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
 KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 
 }  // namespace raft::bench::spatial

From 38213668ccb505333df253e751c0d27cf8111c27 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 24 Jun 2022 11:36:42 +0200
Subject: [PATCH 084/118] Fix incorrect data mapping for int8 types

---
 .../knn/detail/ann_kmeans_balanced.cuh        |  2 +-
 .../raft/spatial/knn/detail/ann_utils.cuh     | 24 +++++++------------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 9280faf0f2..066382db5a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -549,7 +549,7 @@ auto build_fine_clusters(const handle_t& handle,
       continue;
     }
 
-    utils::copy_selected<T>(
+    utils::copy_selected(
       mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream);
 
     build_clusters(handle,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 6e8d36820b..93c3c2f616 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -122,22 +122,14 @@ struct mapping {
   template <typename S>
   HDI auto operator()(const S& x) -> std::enable_if_t<!std::is_same_v<S, T>, T>
   {
-    constexpr double kMult = config<S>::kDivisor / config<T>::kDivisor;
-    return static_cast<T>(static_cast<double>(x) * kMult);
+    constexpr double kMult = config<T>::kDivisor / config<S>::kDivisor;
+    if constexpr (std::is_floating_point_v<S>) { return static_cast<T>(x * static_cast<S>(kMult)); }
+    if constexpr (std::is_floating_point_v<T>) { return static_cast<T>(x) * static_cast<T>(kMult); }
+    return static_cast<T>(static_cast<float>(x) * static_cast<float>(kMult));
   };
   /** @} */
 };
 
-template <>
-struct mapping<float> {
-  template <typename S>
-  HDI auto operator()(const S& x) -> float
-  {
-    constexpr float kMult = static_cast<float>(config<float>::kDivisor / config<S>::kDivisor);
-    return static_cast<float>(x) * kMult;
-  };
-};
-
 /**
  * @brief Sets the first num bytes of the block of memory pointed by ptr to the specified value.
  *
@@ -437,10 +429,10 @@ void outer_add(
 template <typename T, typename S>
 __global__ void copy_selected_kernel(uint32_t n_rows,
                                      uint32_t n_cols,
-                                     const T* src,
+                                     const S* src,
                                      const uint32_t* row_ids,
                                      uint32_t ld_src,
-                                     S* dst,
+                                     T* dst,
                                      uint32_t ld_dst)
 {
   uint64_t gid   = threadIdx.x + blockDim.x * blockIdx.x;
@@ -470,10 +462,10 @@ __global__ void copy_selected_kernel(uint32_t n_rows,
 template <typename T, typename S>
 void copy_selected(uint32_t n_rows,
                    uint32_t n_cols,
-                   const T* src,
+                   const S* src,
                    const uint32_t* row_ids,
                    uint32_t ld_src,
-                   S* dst,
+                   T* dst,
                    uint32_t ld_dst,
                    rmm::cuda_stream_view stream)
 {

From a29baa77c88bdb0a59250437d5e8d049bc76d6b2 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 27 Jun 2022 08:35:45 +0200
Subject: [PATCH 085/118] Introduce kIndexGroupSize constant

---
 .../spatial/knn/detail/ivf_flat_build.cuh     |   7 +-
 .../spatial/knn/detail/ivf_flat_search.cuh    | 113 +++++++++---------
 .../raft/spatial/knn/ivf_flat_types.hpp       |  11 +-
 3 files changed, 70 insertions(+), 61 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 49391221be..0f2f70a315 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -32,6 +32,7 @@ namespace raft::spatial::knn::detail::ivf_flat {
 
 using raft::spatial::knn::ivf_flat::index;
 using raft::spatial::knn::ivf_flat::index_params;
+using raft::spatial::knn::ivf_flat::kIndexGroupSize;
 
 template <typename T, typename... Extents>
 static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
@@ -93,8 +94,8 @@ __global__ void build_index_kernel(const uint32_t* labels,
   // Record the source vector id in the index
   list_index[list_offset + inlist_id] = i;
 
-  // The data is written in interleaved groups of `WarpSize` vectors
-  using interleaved_group = Pow2<WarpSize>;
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
   auto group_offset       = interleaved_group::roundDown(inlist_id);
   auto ingroup_id         = interleaved_group::mod(inlist_id) * veclen;
 
@@ -108,7 +109,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
   // NB: such `veclen` is selected, that `dim % veclen == 0`
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
-      list_data[l * WarpSize + ingroup_id + j] = dataset[l + j];
+      list_data[l * kIndexGroupSize + ingroup_id + j] = dataset[l + j];
     }
   }
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 5c195c53ca..70783bde3e 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -46,6 +46,7 @@
 namespace raft::spatial::knn::detail::ivf_flat {
 
 using raft::spatial::knn::ivf_flat::index;
+using raft::spatial::knn::ivf_flat::kIndexGroupSize;
 using raft::spatial::knn::ivf_flat::search_params;
 
 constexpr int kThreadsPerBlock = 128;
@@ -123,7 +124,6 @@ __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const
  * and per index item.
  *
  * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen)
- * @tparam GroupSize number of vectors in the interleaved groups in the index.
  * @tparam Lambda computing the part of the distance for one dimension and aggregating it:
  *                void (AccT& acc, AccT x, AccT y)
  * @tparam Veclen size of the vectorized load
@@ -131,7 +131,7 @@ __device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const
  * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit
  * values)
  */
-template <int kUnroll, int GroupSize, typename Lambda, int Veclen, typename T, typename AccT>
+template <int kUnroll, typename Lambda, int Veclen, typename T, typename AccT>
 struct loadAndComputeDist {
   Lambda compute_dist;
   AccT& dist;
@@ -155,7 +155,7 @@ struct loadAndComputeDist {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
       T encV[Veclen];
-      ldg(encV, data + (loadIndex + j * GroupSize) * Veclen);
+      ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen);
       T queryRegs[Veclen];
       lds(queryRegs, &query_shared[shmemIndex + j * Veclen]);
 #pragma unroll
@@ -180,13 +180,13 @@ struct loadAndComputeDist {
     T queryReg               = query[baseLoadIndex + laneId];
     constexpr int stride     = kUnroll * Veclen;
     constexpr int totalIter  = WarpSize / stride;
-    constexpr int gmemStride = stride * GroupSize;
+    constexpr int gmemStride = stride * kIndexGroupSize;
 #pragma unroll
     for (int i = 0; i < totalIter; ++i, data += gmemStride) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         T encV[Veclen];
-        ldg(encV, data + (laneId + j * GroupSize) * Veclen);
+        ldg(encV, data + (laneId + j * kIndexGroupSize) * Veclen);
         const int d = (i * kUnroll + j) * Veclen;
 #pragma unroll
         for (int k = 0; k < Veclen; ++k) {
@@ -206,7 +206,7 @@ struct loadAndComputeDist {
     const int loadDim     = dimBlocks + laneId;
     T queryReg            = loadDim < dim ? query[loadDim] : 0;
     const int loadDataIdx = laneId * Veclen;
-    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += GroupSize * Veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) {
       T enc[Veclen];
       ldg(enc, data + loadDataIdx);
 #pragma unroll
@@ -218,8 +218,8 @@ struct loadAndComputeDist {
 };
 
 // This handles uint8_t 8, 16 Veclens
-template <int kUnroll, int GroupSize, typename Lambda, int uint8_veclen>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uint32_t> {
+template <int kUnroll, typename Lambda, int uint8_veclen>
+struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -238,7 +238,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uin
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
       uint32_t encV[veclen_int];
-      ldg(encV, reinterpret_cast<unsigned const*>(data) + loadIndex + j * GroupSize * veclen_int);
+      ldg(encV,
+          reinterpret_cast<unsigned const*>(data) + loadIndex + j * kIndexGroupSize * veclen_int);
       uint32_t queryRegs[veclen_int];
       lds(queryRegs, reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
 #pragma unroll
@@ -258,11 +259,12 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uin
     constexpr int stride = kUnroll * uint8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         uint32_t encV[veclen_int];
-        ldg(encV, reinterpret_cast<unsigned const*>(data) + (laneId + j * GroupSize) * veclen_int);
+        ldg(encV,
+            reinterpret_cast<unsigned const*>(data) + (laneId + j * kIndexGroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
@@ -281,7 +283,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uin
     constexpr int veclen_int = uint8_veclen / 4;
     const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += uint8_veclen, data += GroupSize * uint8_veclen) {
+    for (int d = 0; d < dim - dimBlocks;
+         d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) {
       uint32_t enc[veclen_int];
       ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
@@ -295,8 +298,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, uint8_veclen, uint8_t, uin
 
 // Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
 // using above common template of int2/int4
-template <int kUnroll, int GroupSize, typename Lambda>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -312,7 +315,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
   {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * GroupSize];
+      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * kIndexGroupSize];
       uint32_t queryRegs = reinterpret_cast<unsigned const*>(query_shared + shmemIndex)[j];
       compute_dist(dist, queryRegs, encV);
     }
@@ -328,10 +331,10 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<unsigned const*>(data)[laneId + j * GroupSize];
+        uint32_t encV = reinterpret_cast<unsigned const*>(data)[laneId + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -347,7 +350,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
     constexpr int veclen = 4;
     const int loadDim    = dimBlocks + laneId;
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
@@ -355,8 +358,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 4, uint8_t, uint32_t> {
   }
 };
 
-template <int kUnroll, int GroupSize, typename Lambda>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -372,7 +375,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
   {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * GroupSize];
+      uint32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
       uint32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
       compute_dist(dist, queryRegs, encV);
     }
@@ -389,10 +392,10 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * GroupSize];
+        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -408,7 +411,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
     constexpr int veclen = 2;
     int loadDim          = dimBlocks + laneId * veclen;
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
@@ -416,8 +419,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, uint8_t, uint32_t> {
   }
 };
 
-template <int kUnroll, int GroupSize, typename Lambda>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
   Lambda compute_dist;
   uint32_t& dist;
 
@@ -433,7 +436,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
   {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = data[loadIndex + j * GroupSize];
+      uint32_t encV      = data[loadIndex + j * kIndexGroupSize];
       uint32_t queryRegs = query_shared[shmemIndex + j];
       compute_dist(dist, queryRegs, encV);
     }
@@ -449,10 +452,10 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = data[laneId + j * GroupSize];
+        uint32_t encV = data[laneId + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -468,7 +471,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
     constexpr int veclen = 1;
     int loadDim          = dimBlocks + laneId;
     uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       uint32_t enc = data[laneId];
       uint32_t q   = shfl(queryReg, d, WarpSize);
       compute_dist(dist, q, enc);
@@ -477,8 +480,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, uint8_t, uint32_t> {
 };
 
 // This device function is for int8 veclens 4, 8 and 16
-template <int kUnroll, int GroupSize, typename Lambda, int int8_veclen>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32_t> {
+template <int kUnroll, typename Lambda, int int8_veclen>
+struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
 
@@ -497,7 +500,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
       int32_t encV[veclen_int];
-      ldg(encV, reinterpret_cast<int32_t const*>(data) + (loadIndex + j * GroupSize) * veclen_int);
+      ldg(encV,
+          reinterpret_cast<int32_t const*>(data) + (loadIndex + j * kIndexGroupSize) * veclen_int);
       int32_t queryRegs[veclen_int];
       lds(queryRegs, reinterpret_cast<int32_t const*>(query_shared + shmemIndex) + j * veclen_int);
 #pragma unroll
@@ -519,11 +523,12 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32
     constexpr int stride = kUnroll * int8_veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         int32_t encV[veclen_int];
-        ldg(encV, reinterpret_cast<int32_t const*>(data) + (laneId + j * GroupSize) * veclen_int);
+        ldg(encV,
+            reinterpret_cast<int32_t const*>(data) + (laneId + j * kIndexGroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
@@ -540,7 +545,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32
     constexpr int veclen_int = int8_veclen / 4;
     const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int;
     int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += GroupSize * int8_veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) {
       int32_t enc[veclen_int];
       ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
 #pragma unroll
@@ -552,8 +557,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, int8_veclen, int8_t, int32
   }
 };
 
-template <int kUnroll, int GroupSize, typename Lambda>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
@@ -567,7 +572,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
   {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      int32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * GroupSize];
+      int32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
       int32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
       compute_dist(dist, queryRegs, encV);
     }
@@ -584,10 +589,10 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
     constexpr int stride = kUnroll * veclen;
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * GroupSize];
+        int32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * kIndexGroupSize];
         int32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -600,7 +605,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
     constexpr int veclen = 2;
     int loadDim          = dimBlocks + laneId * veclen;
     int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
       int32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
@@ -608,8 +613,8 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 2, int8_t, int32_t> {
   }
 };
 
-template <int kUnroll, int GroupSize, typename Lambda>
-struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, int8_t, int32_t> {
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
   Lambda compute_dist;
   int32_t& dist;
   __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
@@ -624,7 +629,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, int8_t, int32_t> {
   {
 #pragma unroll
     for (int j = 0; j < kUnroll; ++j) {
-      compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * GroupSize]);
+      compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * kIndexGroupSize]);
     }
   }
 
@@ -638,10 +643,11 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, int8_t, int32_t> {
     int32_t queryReg     = query[baseLoadIndex + laneId];
 
 #pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * GroupSize) {
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        compute_dist(dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[laneId + j * GroupSize]);
+        compute_dist(
+          dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[laneId + j * kIndexGroupSize]);
       }
     }
   }
@@ -651,7 +657,7 @@ struct loadAndComputeDist<kUnroll, GroupSize, Lambda, 1, int8_t, int32_t> {
     constexpr int veclen = 1;
     const int loadDim    = dimBlocks + laneId;
     int32_t queryReg     = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += GroupSize * veclen) {
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
       compute_dist(dist, shfl(queryReg, d, WarpSize), data[laneId]);
     }
   }
@@ -761,16 +767,16 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       if (valid) {
         /// load query from shared mem
         for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {
-          loadAndComputeDist<kUnroll, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
-            dist, compute_dist);
+          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> obj(dist,
+                                                                                   compute_dist);
           obj.runLoadShmemCompute(data, query_shared, laneId, dBase);
           data += WarpSize * kGroupSize;
         }
       }
 
       if (dim > query_smem_elems) {
-        loadAndComputeDist<kUnroll, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
-          dist, compute_dist);
+        loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> obj(dist,
+                                                                                 compute_dist);
         for (int dBase = shLoadDim; dBase < full_warps_along_dim; dBase += WarpSize) {  //
           obj.runLoadShflAndCompute(data, query, dBase, laneId);
         }
@@ -782,8 +788,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
           /// Remainder chunk = dim - full_warps_along_dim
           for (int d = 0; d < dim - full_warps_along_dim;
                d += Veclen, data += kGroupSize * Veclen) {
-            loadAndComputeDist<1, kGroupSize, decltype(compute_dist), Veclen, T, AccT> obj(
-              dist, compute_dist);
+            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> obj(dist, compute_dist);
             obj.runLoadShmemCompute(data, query_shared, laneId, full_warps_along_dim + d);
           }  // end for d < dim - full_warps_along_dim
         }
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 9e06607526..828e465316 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -25,6 +25,9 @@
 
 namespace raft::spatial::knn::ivf_flat {
 
+/** Size of the interleaved group (see `index::data` description). */
+constexpr static uint32_t kIndexGroupSize = 32;
+
 template <typename T>
 struct index {
   using row_major = layout_c_contiguous;
@@ -45,16 +48,16 @@ struct index {
    * Inverted list data [size, dim].
    *
    * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
-   * Within each list (cluster), the data is grouped into blocks of `WarpSize` interleaved
+   * Within each list (cluster), the data is grouped into blocks of `kGroupSize` interleaved
    * vectors. Note, the total index length is slightly larger than the source dataset length,
-   * because each cluster is padded by `WarpSize` elements.
+   * because each cluster is padded by `kGroupSize` elements.
    *
    * Interleaving pattern:
-   * within groups of `WarpSize` rows, the data is interleaved with the block size equal to
+   * within groups of `kGroupSize` rows, the data is interleaved with the block size equal to
    * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
    * followed by a chunk of the same size of the next row, and so on.
    *
-   * __Example__: veclen = 2, dim = 6, WarpSize = 32, list_size = 31
+   * __Example__: veclen = 2, dim = 6, kGroupSize = 32, list_size = 31
    * `
    *   x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
    *   x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,

From 546bef80a199985e55f678bfdd1d6cbf3b1b0e47 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 27 Jun 2022 14:10:44 +0200
Subject: [PATCH 086/118] Cleanup ann_quantized

---
 cpp/include/raft/spatial/knn/ann_common.hpp   |  11 +-
 .../raft/spatial/knn/detail/ann_quantized.cuh | 242 +++++++-----------
 2 files changed, 96 insertions(+), 157 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.hpp b/cpp/include/raft/spatial/knn/ann_common.hpp
index 72d5f365b6..7008bddaa1 100644
--- a/cpp/include/raft/spatial/knn/ann_common.hpp
+++ b/cpp/include/raft/spatial/knn/ann_common.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "detail/processing.hpp"
 #include <faiss/gpu/GpuIndex.h>
 #include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/faiss_mr.hpp>
@@ -28,20 +29,16 @@ class index;
 };
 
 struct knnIndex {
-  faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
+  std::unique_ptr<faiss::gpu::GpuIndex> index;
+  std::unique_ptr<MetricProcessor<float>> metric_processor;
   std::unique_ptr<ivf_flat::index<float>> ivf_flat_float_;
   std::unique_ptr<ivf_flat::index<uint8_t>> ivf_flat_uint8_t_;
   std::unique_ptr<ivf_flat::index<int8_t>> ivf_flat_int8_t_;
 
-  raft::spatial::knn::RmmGpuResources* gpu_res;
+  std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
   int device;
-  ~knnIndex()
-  {
-    delete index;
-    delete gpu_res;
-  }
 
   template <typename T>
   auto ivf_flat() -> std::unique_ptr<ivf_flat::index<T>>&;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 39363ae53d..420299c8d1 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -21,13 +21,12 @@
 #include "knn_brute_force_faiss.cuh"
 
 #include "common_faiss.h"
-#include "processing.hpp"
-
 #include "processing.hpp"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 
 #include <raft/distance/distance.cuh>
+#include <raft/distance/distance_type.hpp>
 #include <raft/label/classlabels.cuh>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
@@ -46,17 +45,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <raft/distance/distance_type.hpp>
-
-#include <iostream>
-#include <set>
-
-#define IVF_FAISS 0
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
+namespace raft ::spatial ::knn::detail {
 
 inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
 {
@@ -83,21 +72,8 @@ void approx_knn_ivfflat_build_index(knnIndex* index,
   faiss::gpu::GpuIndexIVFFlatConfig config;
   config.device                  = index->device;
   faiss::MetricType faiss_metric = build_faiss_metric(params.metric);
-  faiss::gpu::GpuIndexIVFFlat* faiss_index =
-    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params.n_lists, faiss_metric, config);
-  index->index = faiss_index;
-}
-
-template <typename T = float, typename IntType = int>
-void approx_knn_cuivfl_ivfflat_build_index(const raft::handle_t& handle,
-                                           knnIndex* index,
-                                           const ivf_flat::index_params& params,
-                                           T* dataset,
-                                           IntType n_rows,
-                                           IntType dim)
-{
-  index->ivf_flat<T>() = std::make_unique<ivf_flat::index<T>>(
-    ivf_flat::build(handle, params, dataset, n_rows, dim, handle.get_stream()));
+  index->index.reset(
+    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.n_lists, faiss_metric, config));
 }
 
 template <typename IntType = int>
@@ -107,13 +83,17 @@ void approx_knn_ivfpq_build_index(knnIndex* index,
                                   IntType D)
 {
   faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device                          = index->device;
-  config.usePrecomputedTables            = params.use_precomputed_tables;
-  config.interleavedLayout               = params.n_bits != 8;
-  faiss::MetricType faiss_metric         = build_faiss_metric(params.metric);
-  faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ(
-    index->gpu_res, D, params.n_lists, params.n_subquantizers, params.n_bits, faiss_metric, config);
-  index->index = faiss_index;
+  config.device                  = index->device;
+  config.usePrecomputedTables    = params.use_precomputed_tables;
+  config.interleavedLayout       = params.n_bits != 8;
+  faiss::MetricType faiss_metric = build_faiss_metric(params.metric);
+  index->index.reset(new faiss::gpu::GpuIndexIVFPQ(index->gpu_res.get(),
+                                                   D,
+                                                   params.n_lists,
+                                                   params.n_subquantizers,
+                                                   params.n_bits,
+                                                   faiss_metric,
+                                                   config));
 }
 
 template <typename IntType = int>
@@ -123,12 +103,11 @@ void approx_knn_ivfsq_build_index(knnIndex* index,
                                   IntType D)
 {
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
-  config.device                                       = index->device;
-  faiss::MetricType faiss_metric                      = build_faiss_metric(params.metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params.qtype);
-  faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer(
-    index->gpu_res, D, params.n_lists, faiss_qtype, faiss_metric, params.encode_residual);
-  index->index = faiss_index;
+  config.device                                     = index->device;
+  faiss::MetricType faiss_metric                    = build_faiss_metric(params.metric);
+  faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype);
+  index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer(
+    index->gpu_res.get(), D, params.n_lists, faiss_qtype, faiss_metric, params.encode_residual));
 }
 
 template <typename T = float, typename IntType = int>
@@ -144,66 +123,45 @@ void approx_knn_build_index(const handle_t& handle,
   index->index     = nullptr;
   index->metric    = metric;
   index->metricArg = params.metric_arg;
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-  index->device    = device;
   auto ivf_ft_pams = dynamic_cast<const ivf_flat::index_params*>(&params);
   auto ivf_pq_pams = dynamic_cast<const ivf_pq_index_params*>(&params);
   auto ivf_sq_pams = dynamic_cast<const ivf_sq_index_params*>(&params);
 
-  // perform preprocessing
-  // k set to 0 (unused during preprocessing / revertion)
-  if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
+  if constexpr (std::is_same<T, float>{}) {
+    index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
+  }
+  if constexpr (std::is_same<T, float>{}) { index->metric_processor->preprocess(index_array); }
+
+  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2SqrtExpanded ||
+                      metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+                      metric == raft::distance::DistanceType::L2Unexpanded ||
+                      metric == raft::distance::DistanceType::L2Expanded ||
+                      metric == raft::distance::DistanceType::InnerProduct)) {
+    index->ivf_flat<T>() = std::make_unique<ivf_flat::index<T>>(
+      ivf_flat::build(handle, *ivf_ft_pams, index_array, n, D, stream));
+  } else {
+    RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
+    index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
+    index->gpu_res->noTempMemory();
+    index->gpu_res->setDefaultStream(index->device, stream);
     if (ivf_ft_pams) {
-      approx_knn_cuivfl_ivfflat_build_index(handle, index, *ivf_ft_pams, index_array, n, D);
+      approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
+    } else if (ivf_pq_pams) {
+      approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
+    } else if (ivf_sq_pams) {
+      approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
     } else {
-      RAFT_FAIL("IVF Flat algorithm required to fit int8 data");
+      RAFT_FAIL("Unrecognized index type.");
     }
-  } else if constexpr (std::is_same<T, float>{}) {
-    std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-      create_processor<float>(metric, n, D, 0, false, stream);
-
-    if (ivf_ft_pams) {
-      // cuivfl only supports L2/Inner product for now.
-      if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
-          metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
-          metric == raft::distance::DistanceType::L2Unexpanded ||
-          metric == raft::distance::DistanceType::L2Expanded ||
-          metric == raft::distance::DistanceType::InnerProduct) {
-        approx_knn_cuivfl_ivfflat_build_index(handle, index, *ivf_ft_pams, index_array, n, D);
-      } else {
-        raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
-        gpu_res->noTempMemory();
-        gpu_res->setDefaultStream(device, stream);
-        index->gpu_res = gpu_res;
-        approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
-        std::vector<float> h_index_array(n * D);
-        raft::update_host(h_index_array.data(), index_array, h_index_array.size(), stream);
-        query_metric_processor->revert(index_array);
-        index->index->train(n, h_index_array.data());
-        index->index->add(n, h_index_array.data());
-      }
-    } else {
-      int device;
-      RAFT_CUDA_TRY(cudaGetDevice(&device));
-      raft::spatial::knn::RmmGpuResources* gpu_res = new raft::spatial::knn::RmmGpuResources();
-      gpu_res->noTempMemory();
-      gpu_res->setDefaultStream(device, stream);
-      index->gpu_res = gpu_res;
-      query_metric_processor->preprocess(index_array);
-      if (ivf_pq_pams) {
-        approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
-      } else if (ivf_sq_pams) {
-        approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
-      } else {
-        ASSERT(index->index, "KNN index could not be initialized");
-      }
-
+    if constexpr (std::is_same<T, float>{}) {
       index->index->train(n, index_array);
       index->index->add(n, index_array);
-      query_metric_processor->revert(index_array);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
     }
   }
+
+  if constexpr (std::is_same<T, float>{}) { index->metric_processor->revert(index_array); }
 }
 
 template <typename T = float, typename IntType = int>
@@ -216,70 +174,54 @@ void approx_knn_search(const handle_t& handle,
                        T* query_array,
                        IntType n)
 {
-  if (dynamic_cast<GpuIndexIVF*>(index->index) && dynamic_cast<const ivf_search_params*>(&params)) {
-    dynamic_cast<GpuIndexIVF*>(index->index)
-      ->setNumProbes(dynamic_cast<const ivf_search_params&>(params).n_probes);
-  }
-  // perform preprocessing
-#if 0
-  std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-  create_processor<float>(index->metric, n, index->index->d, k, false, handle.get_stream());
-  query_metric_processor->preprocess(query_array);
-    index->index->search(n, query_array, k, distances, indices);
-#else
   auto ivf_ft_pams = dynamic_cast<const ivf_flat::search_params*>(&params);
-  if constexpr (std::is_same<T, uint8_t>{} || std::is_same<T, int8_t>{}) {
-    if (ivf_ft_pams) {
-      ivf_flat::search(handle,
-                       *ivf_ft_pams,
-                       *(index->ivf_flat<T>()),
-                       query_array,
-                       n,
-                       k,
-                       (size_t*)indices,
-                       distances,
-                       handle.get_stream());
-    }
-  } else if constexpr (std::is_same<T, float>{}) {
-    std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-      index->metric, n, index->ivf_flat<T>()->dim(), k, false, handle.get_stream());
-    query_metric_processor->preprocess(query_array);
+  auto ivf_pams    = dynamic_cast<const ivf_search_params*>(&params);
+  auto faiss_ivf   = dynamic_cast<GpuIndexIVF*>(index->index.get());
+  if (ivf_pams && faiss_ivf) { faiss_ivf->setNumProbes(ivf_pams->n_probes); }
 
-    if (ivf_ft_pams) {
-      ivf_flat::search(handle,
-                       *ivf_ft_pams,
-                       *(index->ivf_flat<T>()),
-                       query_array,
-                       n,
-                       k,
-                       (size_t*)indices,
-                       distances,
-                       handle.get_stream());
-    }
-    query_metric_processor->revert(query_array);
+  if constexpr (std::is_same<T, float>{}) { index->metric_processor->preprocess(query_array); }
 
-    // Perform necessary post-processing
-    if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
-        index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
-        index->metric == raft::distance::DistanceType::LpUnexpanded) {
-      /**
-       * post-processing
-       */
-      float p = 0.5;  // standard l2
-      if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
-      raft::linalg::unaryOp<float>(
-        distances,
-        distances,
-        n * k,
-        [p] __device__(float input) { return powf(input, p); },
-        handle.get_stream());
+  // search
+  if (faiss_ivf) {
+    if constexpr (std::is_same<T, float>{}) {
+      faiss_ivf->search(n, query_array, k, distances, indices);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
     }
-    query_metric_processor->postprocess(distances);
+  } else if (ivf_ft_pams) {
+    ivf_flat::search(handle,
+                     *ivf_ft_pams,
+                     *(index->ivf_flat<T>()),
+                     query_array,
+                     n,
+                     k,
+                     (size_t*)indices,
+                     distances,
+                     handle.get_stream());
+  } else {
+    RAFT_FAIL("The model is not trained");
+  }
+
+  // revert changes to the query
+  if constexpr (std::is_same<T, float>{}) { index->metric_processor->revert(query_array); }
+
+  // perform post-processing to show the real distances
+  if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
+      index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+      index->metric == raft::distance::DistanceType::LpUnexpanded) {
+    /**
+     * post-processing
+     */
+    float p = 0.5;  // standard l2
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
+    raft::linalg::unaryOp<float>(
+      distances,
+      distances,
+      n * k,
+      [p] __device__(float input) { return powf(input, p); },
+      handle.get_stream());
   }
-#endif
+  if constexpr (std::is_same<T, float>{}) { index->metric_processor->postprocess(distances); }
 }
 
-}  // namespace detail
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn::detail

From 32d0d2e04d743401cb7b110951e892f45d6d9120 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 27 Jun 2022 17:22:46 +0200
Subject: [PATCH 087/118] Add several type aliases and helpers for creating
 mdarrays

---
 cpp/include/raft/core/mdarray.hpp   | 216 ++++++++++++++++++++--------
 cpp/include/raft/detail/mdarray.hpp |  17 ++-
 2 files changed, 169 insertions(+), 64 deletions(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 0ab882e7a0..3f257b69f6 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -27,7 +27,9 @@
 #include <raft/core/handle.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/detail/mdarray.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 
 namespace raft {
 /**
@@ -37,14 +39,38 @@ template <size_t... ExtentsPack>
 using extents = std::experimental::extents<ExtentsPack...>;
 
 /**
- * @\brief C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory.
+ * @defgroup C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory.
+ * @{
  */
-using layout_c_contiguous = detail::stdex::layout_right;
+using detail::stdex::layout_right;
+using layout_c_contiguous = layout_right;
+using row_major           = layout_right;
+/** @} */
 
 /**
- * @\brief F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory.
+ * @defgroup F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory.
  */
-using layout_f_contiguous = detail::stdex::layout_left;
+using detail::stdex::layout_left;
+using layout_f_contiguous = layout_left;
+using col_major           = layout_left;
+/** @} */
+
+/**
+ * @defgroup Common mdarray/mdspan extent types. The rank is known at compile time, each dimension
+ * is known at run time (dynamic_extent in each dimension).
+ */
+using detail::matrix_extent;
+using detail::scalar_extent;
+using detail::vector_extent;
+
+using extent_1d = vector_extent;
+using extent_2d = matrix_extent;
+using extent_3d = detail::stdex::extents<dynamic_extent, dynamic_extent, dynamic_extent>;
+using extent_4d =
+  detail::stdex::extents<dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent>;
+using extent_5d = detail::stdex::
+  extents<dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent>;
+/** @} */
 
 template <typename ElementType,
           typename Extents,
@@ -511,28 +537,28 @@ using device_mdarray =
  * @tparam ElementType the data type of the scalar element
  */
 template <typename ElementType>
-using host_scalar = host_mdarray<ElementType, detail::scalar_extent>;
+using host_scalar = host_mdarray<ElementType, scalar_extent>;
 
 /**
  * @brief Shorthand for 0-dim host mdarray (scalar).
  * @tparam ElementType the data type of the scalar element
  */
 template <typename ElementType>
-using device_scalar = device_mdarray<ElementType, detail::scalar_extent>;
+using device_scalar = device_mdarray<ElementType, scalar_extent>;
 
 /**
  * @brief Shorthand for 1-dim host mdarray.
  * @tparam ElementType the data type of the vector elements
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using host_vector = host_mdarray<ElementType, detail::vector_extent, LayoutPolicy>;
+using host_vector = host_mdarray<ElementType, vector_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for 1-dim device mdarray.
  * @tparam ElementType the data type of the vector elements
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using device_vector = device_mdarray<ElementType, detail::vector_extent, LayoutPolicy>;
+using device_vector = device_mdarray<ElementType, vector_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for c-contiguous host matrix.
@@ -540,7 +566,7 @@ using device_vector = device_mdarray<ElementType, detail::vector_extent, LayoutP
  * @tparam LayoutPolicy policy for strides and layout ordering
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using host_matrix = host_mdarray<ElementType, detail::matrix_extent, LayoutPolicy>;
+using host_matrix = host_mdarray<ElementType, matrix_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for c-contiguous device matrix.
@@ -548,35 +574,35 @@ using host_matrix = host_mdarray<ElementType, detail::matrix_extent, LayoutPolic
  * @tparam LayoutPolicy policy for strides and layout ordering
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using device_matrix = device_mdarray<ElementType, detail::matrix_extent, LayoutPolicy>;
+using device_matrix = device_mdarray<ElementType, matrix_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for 0-dim host mdspan (scalar).
  * @tparam ElementType the data type of the scalar element
  */
 template <typename ElementType>
-using host_scalar_view = host_mdspan<ElementType, detail::scalar_extent>;
+using host_scalar_view = host_mdspan<ElementType, scalar_extent>;
 
 /**
  * @brief Shorthand for 0-dim host mdspan (scalar).
  * @tparam ElementType the data type of the scalar element
  */
 template <typename ElementType>
-using device_scalar_view = device_mdspan<ElementType, detail::scalar_extent>;
+using device_scalar_view = device_mdspan<ElementType, scalar_extent>;
 
 /**
  * @brief Shorthand for 1-dim host mdspan.
  * @tparam ElementType the data type of the vector elements
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using host_vector_view = host_mdspan<ElementType, detail::vector_extent, LayoutPolicy>;
+using host_vector_view = host_mdspan<ElementType, vector_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for 1-dim device mdspan.
  * @tparam ElementType the data type of the vector elements
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using device_vector_view = device_mdspan<ElementType, detail::vector_extent, LayoutPolicy>;
+using device_vector_view = device_mdspan<ElementType, vector_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for c-contiguous host matrix view.
@@ -585,7 +611,7 @@ using device_vector_view = device_mdspan<ElementType, detail::vector_extent, Lay
  *
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using host_matrix_view = host_mdspan<ElementType, detail::matrix_extent, LayoutPolicy>;
+using host_matrix_view = host_mdspan<ElementType, matrix_extent, LayoutPolicy>;
 
 /**
  * @brief Shorthand for c-contiguous device matrix view.
@@ -594,7 +620,7 @@ using host_matrix_view = host_mdspan<ElementType, detail::matrix_extent, LayoutP
  *
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-using device_matrix_view = device_mdspan<ElementType, detail::matrix_extent, LayoutPolicy>;
+using device_matrix_view = device_mdspan<ElementType, matrix_extent, LayoutPolicy>;
 
 /**
  * @brief Create a 0-dim (scalar) mdspan instance for host value.
@@ -603,9 +629,9 @@ using device_matrix_view = device_mdspan<ElementType, detail::matrix_extent, Lay
  * @param[in] ptr on device to wrap
  */
 template <typename ElementType>
-auto make_host_scalar_view(ElementType* ptr)
+inline auto make_host_scalar_view(ElementType* ptr)
 {
-  detail::scalar_extent extents;
+  scalar_extent extents;
   return host_scalar_view<ElementType>{ptr, extents};
 }
 
@@ -616,9 +642,9 @@ auto make_host_scalar_view(ElementType* ptr)
  * @param[in] ptr on device to wrap
  */
 template <typename ElementType>
-auto make_device_scalar_view(ElementType* ptr)
+inline auto make_device_scalar_view(ElementType* ptr)
 {
-  detail::scalar_extent extents;
+  scalar_extent extents;
   return device_scalar_view<ElementType>{ptr, extents};
 }
 
@@ -633,9 +659,9 @@ auto make_device_scalar_view(ElementType* ptr)
  * @param[in] n_cols number of columns in pointer
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
+inline auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
 {
-  detail::matrix_extent extents{n_rows, n_cols};
+  matrix_extent extents{n_rows, n_cols};
   return host_matrix_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 /**
@@ -649,9 +675,9 @@ auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
  * @param[in] n_cols number of columns in pointer
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
+inline auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
 {
-  detail::matrix_extent extents{n_rows, n_cols};
+  matrix_extent extents{n_rows, n_cols};
   return device_matrix_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 
@@ -663,9 +689,9 @@ auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
  * @return raft::host_vector_view
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_host_vector_view(ElementType* ptr, size_t n)
+inline auto make_host_vector_view(ElementType* ptr, size_t n)
 {
-  detail::vector_extent extents{n};
+  vector_extent extents{n};
   return host_vector_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 
@@ -677,12 +703,90 @@ auto make_host_vector_view(ElementType* ptr, size_t n)
  * @return raft::device_vector_view
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_vector_view(ElementType* ptr, size_t n)
+inline auto make_device_vector_view(ElementType* ptr, size_t n)
 {
-  detail::vector_extent extents{n};
+  vector_extent extents{n};
   return device_vector_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 
+template <typename... Extents>
+using ensure_integral_extents =
+  std::enable_if_t<(true && ... && std::is_integral_v<Extents>), void>;
+
+/**
+ * @brief Create a host mdarray.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::host_mdarray
+ */
+template <typename ElementType,
+          typename LayoutPolicy = layout_c_contiguous,
+          typename... Extents,
+          typename = ensure_integral_extents<Extents...>>
+inline auto make_host_mdarray(Extents... exts)
+{
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = host_mdarray<ElementType, extent_t, LayoutPolicy>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy;
+
+  return mdarray_t{layout, policy};
+}
+
+/**
+ * @brief Create a device mdarray.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param stream cuda stream for ordering events
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::device_mdarray
+ */
+template <typename ElementType,
+          typename LayoutPolicy = layout_c_contiguous,
+          typename... Extents,
+          typename = ensure_integral_extents<Extents...>>
+inline auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
+{
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy{stream};
+
+  return mdarray_t{layout, policy};
+}
+
+/**
+ * @brief Create a device mdarray.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param stream cuda stream for ordering events
+ * @param mr rmm memory resource used for allocating the memory for the array
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::device_mdarray
+ */
+template <typename ElementType,
+          typename LayoutPolicy = layout_c_contiguous,
+          typename... Extents,
+          typename = ensure_integral_extents<Extents...>>
+inline auto make_device_mdarray(rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr,
+                                Extents... exts)
+{
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy{stream, mr};
+
+  return mdarray_t{layout, policy};
+}
+
 /**
  * @brief Create a 2-dim c-contiguous host mdarray.
  * @tparam ElementType the data type of the matrix elements
@@ -692,12 +796,9 @@ auto make_device_vector_view(ElementType* ptr, size_t n)
  * @return raft::host_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_host_matrix(size_t n_rows, size_t n_cols)
+inline auto make_host_matrix(size_t n_rows, size_t n_cols)
 {
-  detail::matrix_extent extents{n_rows, n_cols};
-  using policy_t = typename host_matrix<ElementType>::container_policy_type;
-  policy_t policy;
-  return host_matrix<ElementType, LayoutPolicy>{extents, policy};
+  return make_host_mdarray<ElementType, LayoutPolicy>(n_rows, n_cols);
 }
 
 /**
@@ -710,12 +811,9 @@ auto make_host_matrix(size_t n_rows, size_t n_cols)
  * @return raft::device_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream)
+inline auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream)
 {
-  detail::matrix_extent extents{n_rows, n_cols};
-  using policy_t = typename device_matrix<ElementType>::container_policy_type;
-  policy_t policy{stream};
-  return device_matrix<ElementType, LayoutPolicy>{extents, policy};
+  return make_device_mdarray<ElementType, LayoutPolicy>(stream, n_rows, n_cols);
 }
 
 /**
@@ -729,7 +827,7 @@ auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stre
  * @return raft::device_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols)
+inline auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols)
 {
   return make_device_matrix<ElementType, LayoutPolicy>(n_rows, n_cols, handle.get_stream());
 }
@@ -742,12 +840,12 @@ auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_co
  * @return raft::host_scalar
  */
 template <typename ElementType>
-auto make_host_scalar(ElementType const& v)
+inline auto make_host_scalar(ElementType const& v)
 {
   // FIXME(jiamingy): We can optimize this by using std::array as container policy, which
   // requires some more compile time dispatching. This is enabled in the ref impl but
   // hasn't been ported here yet.
-  detail::scalar_extent extents;
+  scalar_extent extents;
   using policy_t = typename host_scalar<ElementType>::container_policy_type;
   policy_t policy;
   auto scalar = host_scalar<ElementType>{extents, policy};
@@ -764,9 +862,9 @@ auto make_host_scalar(ElementType const& v)
  * @return raft::device_scalar
  */
 template <typename ElementType>
-auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream)
+inline auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream)
 {
-  detail::scalar_extent extents;
+  scalar_extent extents;
   using policy_t = typename device_scalar<ElementType>::container_policy_type;
   policy_t policy{stream};
   auto scalar = device_scalar<ElementType>{extents, policy};
@@ -783,7 +881,7 @@ auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream)
  * @return raft::device_scalar
  */
 template <typename ElementType>
-auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
+inline auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
 {
   return make_device_scalar<ElementType>(v, handle.get_stream());
 }
@@ -795,12 +893,9 @@ auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
  * @return raft::host_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_host_vector(size_t n)
+inline auto make_host_vector(size_t n)
 {
-  detail::vector_extent extents{n};
-  using policy_t = typename host_vector<ElementType, LayoutPolicy>::container_policy_type;
-  policy_t policy;
-  return host_vector<ElementType, LayoutPolicy>{extents, policy};
+  return make_host_mdarray<ElementType, LayoutPolicy>(n);
 }
 
 /**
@@ -811,12 +906,9 @@ auto make_host_vector(size_t n)
  * @return raft::device_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
+inline auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
 {
-  detail::vector_extent extents{n};
-  using policy_t = typename device_vector<ElementType, LayoutPolicy>::container_policy_type;
-  policy_t policy{stream};
-  return device_vector<ElementType, LayoutPolicy>{extents, policy};
+  return make_device_mdarray<ElementType, LayoutPolicy>(stream, n);
 }
 
 /**
@@ -827,7 +919,7 @@ auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
  * @return raft::device_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-auto make_device_vector(raft::handle_t const& handle, size_t n)
+inline auto make_device_vector(raft::handle_t const& handle, size_t n)
 {
   return make_device_vector<ElementType, LayoutPolicy>(n, handle.get_stream());
 }
@@ -841,14 +933,14 @@ auto make_device_vector(raft::handle_t const& handle, size_t n)
  *         depending on AccessoryPolicy
  */
 template <typename mdspan_type, std::enable_if_t<is_mdspan_v<mdspan_type>>* = nullptr>
-auto flatten(mdspan_type mds)
+inline auto flatten(mdspan_type mds)
 {
   RAFT_EXPECTS(mds.is_contiguous(), "Input must be contiguous.");
 
-  detail::vector_extent ext{mds.size()};
+  vector_extent ext{mds.size()};
 
   return detail::stdex::mdspan<typename mdspan_type::element_type,
-                               detail::vector_extent,
+                               vector_extent,
                                typename mdspan_type::layout_type,
                                typename mdspan_type::accessor_type>(mds.data(), ext);
 }
@@ -863,7 +955,7 @@ auto flatten(mdspan_type mds)
  */
 template <typename array_interface_type,
           std::enable_if_t<is_array_interface_v<array_interface_type>>* = nullptr>
-auto flatten(const array_interface_type& mda)
+inline auto flatten(const array_interface_type& mda)
 {
   return flatten(mda.view());
 }
@@ -880,7 +972,7 @@ auto flatten(const array_interface_type& mda)
 template <typename mdspan_type,
           size_t... Extents,
           std::enable_if_t<is_mdspan_v<mdspan_type>>* = nullptr>
-auto reshape(mdspan_type mds, extents<Extents...> new_shape)
+inline auto reshape(mdspan_type mds, extents<Extents...> new_shape)
 {
   RAFT_EXPECTS(mds.is_contiguous(), "Input must be contiguous.");
 
@@ -909,9 +1001,9 @@ auto reshape(mdspan_type mds, extents<Extents...> new_shape)
 template <typename array_interface_type,
           size_t... Extents,
           std::enable_if_t<is_array_interface_v<array_interface_type>>* = nullptr>
-auto reshape(const array_interface_type& mda, extents<Extents...> new_shape)
+inline auto reshape(const array_interface_type& mda, extents<Extents...> new_shape)
 {
   return reshape(mda.view(), new_shape);
 }
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index cb6f8a0920..96069ec5dd 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -24,8 +24,11 @@
 #include <raft/core/mdspan.hpp>
 #include <raft/cudart_utils.h>
 #include <raft/detail/span.hpp>  // dynamic_extent
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
 #include <thrust/device_ptr.h>
 
 namespace raft::detail {
@@ -138,6 +141,7 @@ class device_uvector {
 template <typename ElementType>
 class device_uvector_policy {
   rmm::cuda_stream_view stream_;
+  rmm::mr::device_memory_resource* mr_;
 
  public:
   using element_type   = ElementType;
@@ -152,12 +156,21 @@ class device_uvector_policy {
   using const_accessor_policy = std::experimental::default_accessor<element_type const>;
 
  public:
-  auto create(size_t n) -> container_type { return container_type(n, stream_); }
+  auto create(size_t n) -> container_type
+  {
+    return mr_ ? container_type(n, stream_, mr_) : container_type(n, stream_);
+  }
 
   device_uvector_policy() = delete;
   explicit device_uvector_policy(rmm::cuda_stream_view stream) noexcept(
     std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}
+    : stream_{stream}, mr_(nullptr)
+  {
+  }
+
+  device_uvector_policy(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) noexcept(
+    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
+    : stream_{stream}, mr_(mr)
   {
   }
 

From 5f427c029bddbe5aa4c47d943e0e07c34a344878 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 28 Jun 2022 08:05:36 +0200
Subject: [PATCH 088/118] Remove unnecessary inlines and fix docs

---
 cpp/include/raft/core/mdarray.hpp | 64 ++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 3f257b69f6..b5bc86322c 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -49,6 +49,7 @@ using row_major           = layout_right;
 
 /**
  * @defgroup F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory.
+ * @{
  */
 using detail::stdex::layout_left;
 using layout_f_contiguous = layout_left;
@@ -58,6 +59,7 @@ using col_major           = layout_left;
 /**
  * @defgroup Common mdarray/mdspan extent types. The rank is known at compile time, each dimension
  * is known at run time (dynamic_extent in each dimension).
+ * @{
  */
 using detail::matrix_extent;
 using detail::scalar_extent;
@@ -629,7 +631,7 @@ using device_matrix_view = device_mdspan<ElementType, matrix_extent, LayoutPolic
  * @param[in] ptr on device to wrap
  */
 template <typename ElementType>
-inline auto make_host_scalar_view(ElementType* ptr)
+auto make_host_scalar_view(ElementType* ptr)
 {
   scalar_extent extents;
   return host_scalar_view<ElementType>{ptr, extents};
@@ -642,7 +644,7 @@ inline auto make_host_scalar_view(ElementType* ptr)
  * @param[in] ptr on device to wrap
  */
 template <typename ElementType>
-inline auto make_device_scalar_view(ElementType* ptr)
+auto make_device_scalar_view(ElementType* ptr)
 {
   scalar_extent extents;
   return device_scalar_view<ElementType>{ptr, extents};
@@ -659,7 +661,7 @@ inline auto make_device_scalar_view(ElementType* ptr)
  * @param[in] n_cols number of columns in pointer
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
+auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
 {
   matrix_extent extents{n_rows, n_cols};
   return host_matrix_view<ElementType, LayoutPolicy>{ptr, extents};
@@ -675,7 +677,7 @@ inline auto make_host_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols
  * @param[in] n_cols number of columns in pointer
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
+auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_cols)
 {
   matrix_extent extents{n_rows, n_cols};
   return device_matrix_view<ElementType, LayoutPolicy>{ptr, extents};
@@ -689,7 +691,7 @@ inline auto make_device_matrix_view(ElementType* ptr, size_t n_rows, size_t n_co
  * @return raft::host_vector_view
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_host_vector_view(ElementType* ptr, size_t n)
+auto make_host_vector_view(ElementType* ptr, size_t n)
 {
   vector_extent extents{n};
   return host_vector_view<ElementType, LayoutPolicy>{ptr, extents};
@@ -703,15 +705,23 @@ inline auto make_host_vector_view(ElementType* ptr, size_t n)
  * @return raft::device_vector_view
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_vector_view(ElementType* ptr, size_t n)
+auto make_device_vector_view(ElementType* ptr, size_t n)
 {
   vector_extent extents{n};
   return device_vector_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 
+namespace detail {
+/**
+ * Ensure all types listed in the parameter pack `Extents` are integral types.
+ * Usage:
+ *   put it as the last nameless template parameter of a function:
+ *     `typename = ensure_integral_extents<Extents...>`
+ */
 template <typename... Extents>
 using ensure_integral_extents =
   std::enable_if_t<(true && ... && std::is_integral_v<Extents>), void>;
+}  // namespace detail
 
 /**
  * @brief Create a host mdarray.
@@ -723,8 +733,8 @@ using ensure_integral_extents =
 template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous,
           typename... Extents,
-          typename = ensure_integral_extents<Extents...>>
-inline auto make_host_mdarray(Extents... exts)
+          typename = detail::ensure_integral_extents<Extents...>>
+auto make_host_mdarray(Extents... exts)
 {
   using extent_t  = extents<((void)exts, dynamic_extent)...>;
   using mdarray_t = host_mdarray<ElementType, extent_t, LayoutPolicy>;
@@ -747,8 +757,8 @@ inline auto make_host_mdarray(Extents... exts)
 template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous,
           typename... Extents,
-          typename = ensure_integral_extents<Extents...>>
-inline auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
+          typename = detail::ensure_integral_extents<Extents...>>
+auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
 {
   using extent_t  = extents<((void)exts, dynamic_extent)...>;
   using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
@@ -772,10 +782,10 @@ inline auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
 template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous,
           typename... Extents,
-          typename = ensure_integral_extents<Extents...>>
-inline auto make_device_mdarray(rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr,
-                                Extents... exts)
+          typename = detail::ensure_integral_extents<Extents...>>
+auto make_device_mdarray(rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource* mr,
+                         Extents... exts)
 {
   using extent_t  = extents<((void)exts, dynamic_extent)...>;
   using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
@@ -796,7 +806,7 @@ inline auto make_device_mdarray(rmm::cuda_stream_view stream,
  * @return raft::host_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_host_matrix(size_t n_rows, size_t n_cols)
+auto make_host_matrix(size_t n_rows, size_t n_cols)
 {
   return make_host_mdarray<ElementType, LayoutPolicy>(n_rows, n_cols);
 }
@@ -811,7 +821,7 @@ inline auto make_host_matrix(size_t n_rows, size_t n_cols)
  * @return raft::device_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream)
+auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_view stream)
 {
   return make_device_mdarray<ElementType, LayoutPolicy>(stream, n_rows, n_cols);
 }
@@ -827,7 +837,7 @@ inline auto make_device_matrix(size_t n_rows, size_t n_cols, rmm::cuda_stream_vi
  * @return raft::device_matrix
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols)
+auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size_t n_cols)
 {
   return make_device_matrix<ElementType, LayoutPolicy>(n_rows, n_cols, handle.get_stream());
 }
@@ -840,7 +850,7 @@ inline auto make_device_matrix(raft::handle_t const& handle, size_t n_rows, size
  * @return raft::host_scalar
  */
 template <typename ElementType>
-inline auto make_host_scalar(ElementType const& v)
+auto make_host_scalar(ElementType const& v)
 {
   // FIXME(jiamingy): We can optimize this by using std::array as container policy, which
   // requires some more compile time dispatching. This is enabled in the ref impl but
@@ -862,7 +872,7 @@ inline auto make_host_scalar(ElementType const& v)
  * @return raft::device_scalar
  */
 template <typename ElementType>
-inline auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream)
+auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view stream)
 {
   scalar_extent extents;
   using policy_t = typename device_scalar<ElementType>::container_policy_type;
@@ -881,7 +891,7 @@ inline auto make_device_scalar(ElementType const& v, rmm::cuda_stream_view strea
  * @return raft::device_scalar
  */
 template <typename ElementType>
-inline auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
+auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
 {
   return make_device_scalar<ElementType>(v, handle.get_stream());
 }
@@ -893,7 +903,7 @@ inline auto make_device_scalar(raft::handle_t const& handle, ElementType const&
  * @return raft::host_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_host_vector(size_t n)
+auto make_host_vector(size_t n)
 {
   return make_host_mdarray<ElementType, LayoutPolicy>(n);
 }
@@ -906,7 +916,7 @@ inline auto make_host_vector(size_t n)
  * @return raft::device_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
+auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
 {
   return make_device_mdarray<ElementType, LayoutPolicy>(stream, n);
 }
@@ -919,7 +929,7 @@ inline auto make_device_vector(size_t n, rmm::cuda_stream_view stream)
  * @return raft::device_vector
  */
 template <typename ElementType, typename LayoutPolicy = layout_c_contiguous>
-inline auto make_device_vector(raft::handle_t const& handle, size_t n)
+auto make_device_vector(raft::handle_t const& handle, size_t n)
 {
   return make_device_vector<ElementType, LayoutPolicy>(n, handle.get_stream());
 }
@@ -933,7 +943,7 @@ inline auto make_device_vector(raft::handle_t const& handle, size_t n)
  *         depending on AccessoryPolicy
  */
 template <typename mdspan_type, std::enable_if_t<is_mdspan_v<mdspan_type>>* = nullptr>
-inline auto flatten(mdspan_type mds)
+auto flatten(mdspan_type mds)
 {
   RAFT_EXPECTS(mds.is_contiguous(), "Input must be contiguous.");
 
@@ -955,7 +965,7 @@ inline auto flatten(mdspan_type mds)
  */
 template <typename array_interface_type,
           std::enable_if_t<is_array_interface_v<array_interface_type>>* = nullptr>
-inline auto flatten(const array_interface_type& mda)
+auto flatten(const array_interface_type& mda)
 {
   return flatten(mda.view());
 }
@@ -972,7 +982,7 @@ inline auto flatten(const array_interface_type& mda)
 template <typename mdspan_type,
           size_t... Extents,
           std::enable_if_t<is_mdspan_v<mdspan_type>>* = nullptr>
-inline auto reshape(mdspan_type mds, extents<Extents...> new_shape)
+auto reshape(mdspan_type mds, extents<Extents...> new_shape)
 {
   RAFT_EXPECTS(mds.is_contiguous(), "Input must be contiguous.");
 
@@ -1001,7 +1011,7 @@ inline auto reshape(mdspan_type mds, extents<Extents...> new_shape)
 template <typename array_interface_type,
           size_t... Extents,
           std::enable_if_t<is_array_interface_v<array_interface_type>>* = nullptr>
-inline auto reshape(const array_interface_type& mda, extents<Extents...> new_shape)
+auto reshape(const array_interface_type& mda, extents<Extents...> new_shape)
 {
   return reshape(mda.view(), new_shape);
 }

From c581fe231deb5f935e458d7dae7c157aef8c02dc Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 28 Jun 2022 18:26:11 +0200
Subject: [PATCH 089/118] More refactoring and a few forceinlines

---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 317 +++++++++---------
 1 file changed, 160 insertions(+), 157 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 70783bde3e..c3cbd6f3ea 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -175,9 +175,9 @@ struct loadAndComputeDist {
   __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
                                                         const T* query,
                                                         IdxT baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
-    T queryReg               = query[baseLoadIndex + laneId];
+    T queryReg               = query[baseLoadIndex + lane_id];
     constexpr int stride     = kUnroll * Veclen;
     constexpr int totalIter  = WarpSize / stride;
     constexpr int gmemStride = stride * kIndexGroupSize;
@@ -186,7 +186,7 @@ struct loadAndComputeDist {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         T encV[Veclen];
-        ldg(encV, data + (laneId + j * kIndexGroupSize) * Veclen);
+        ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen);
         const int d = (i * kUnroll + j) * Veclen;
 #pragma unroll
         for (int k = 0; k < Veclen; ++k) {
@@ -201,11 +201,11 @@ struct loadAndComputeDist {
    * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`.
    */
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const T*& data, const T* query, const int laneId, const int dim, const int dimBlocks)
+    const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks)
   {
-    const int loadDim     = dimBlocks + laneId;
+    const int loadDim     = dimBlocks + lane_id;
     T queryReg            = loadDim < dim ? query[loadDim] : 0;
-    const int loadDataIdx = laneId * Veclen;
+    const int loadDataIdx = lane_id * Veclen;
     for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) {
       T enc[Veclen];
       ldg(enc, data + loadDataIdx);
@@ -251,11 +251,11 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
                                                         const uint8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
     uint32_t queryReg =
-      (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
+      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
     constexpr int stride = kUnroll * uint8_veclen;
 
 #pragma unroll
@@ -264,7 +264,7 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
       for (int j = 0; j < kUnroll; ++j) {
         uint32_t encV[veclen_int];
         ldg(encV,
-            reinterpret_cast<unsigned const*>(data) + (laneId + j * kIndexGroupSize) * veclen_int);
+            reinterpret_cast<unsigned const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
@@ -276,17 +276,17 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
                                                                  const uint8_t* query,
-                                                                 const int laneId,
+                                                                 const int lane_id,
                                                                  const int dim,
                                                                  const int dimBlocks)
   {
     constexpr int veclen_int = uint8_veclen / 4;
-    const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int
+    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks;
          d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) {
       uint32_t enc[veclen_int];
-      ldg(enc, reinterpret_cast<uint32_t const*>(data) + laneId * veclen_int);
+      ldg(enc, reinterpret_cast<uint32_t const*>(data) + lane_id * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         uint32_t q = shfl(queryReg, (d / 4) + k, WarpSize);
@@ -323,10 +323,10 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
                                                         const uint8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     uint32_t queryReg =
-      (laneId < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[laneId] : 0;
+      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
     constexpr int veclen = 4;
     constexpr int stride = kUnroll * veclen;
 
@@ -334,7 +334,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
     for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<unsigned const*>(data)[laneId + j * kIndexGroupSize];
+        uint32_t encV = reinterpret_cast<unsigned const*>(data)[lane_id + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -343,15 +343,15 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
                                                                  const uint8_t* query,
-                                                                 const int laneId,
+                                                                 const int lane_id,
                                                                  const int dim,
                                                                  const int dimBlocks)
   {
     constexpr int veclen = 4;
-    const int loadDim    = dimBlocks + laneId;
+    const int loadDim    = dimBlocks + lane_id;
     uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<unsigned const*>(data)[laneId];
+      uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
     }  // end for d < dim - dimBlocks
@@ -384,10 +384,10 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
                                                         const uint8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     uint32_t queryReg =
-      (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
+      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
     constexpr int veclen = 2;
     constexpr int stride = kUnroll * veclen;
 
@@ -395,7 +395,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
     for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * kIndexGroupSize];
+        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -404,15 +404,15 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
                                                                  const uint8_t* query,
-                                                                 const int laneId,
+                                                                 const int lane_id,
                                                                  const int dim,
                                                                  const int dimBlocks)
   {
     constexpr int veclen = 2;
-    int loadDim          = dimBlocks + laneId * veclen;
+    int loadDim          = dimBlocks + lane_id * veclen;
     uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<uint16_t const*>(data)[laneId];
+      uint32_t enc = reinterpret_cast<uint16_t const*>(data)[lane_id];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
     }
@@ -445,9 +445,9 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
                                                         const uint8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
-    uint32_t queryReg    = query[baseLoadIndex + laneId];
+    uint32_t queryReg    = query[baseLoadIndex + lane_id];
     constexpr int veclen = 1;
     constexpr int stride = kUnroll * veclen;
 
@@ -455,7 +455,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
     for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = data[laneId + j * kIndexGroupSize];
+        uint32_t encV = data[lane_id + j * kIndexGroupSize];
         uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -464,15 +464,15 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
                                                                  const uint8_t* query,
-                                                                 const int laneId,
+                                                                 const int lane_id,
                                                                  const int dim,
                                                                  const int dimBlocks)
   {
     constexpr int veclen = 1;
-    int loadDim          = dimBlocks + laneId;
+    int loadDim          = dimBlocks + lane_id;
     uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = data[laneId];
+      uint32_t enc = data[lane_id];
       uint32_t q   = shfl(queryReg, d, WarpSize);
       compute_dist(dist, q, enc);
     }
@@ -514,12 +514,12 @@ struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
                                                         const int8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
 
     int32_t queryReg =
-      (laneId < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[laneId] : 0;
+      (lane_id < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[lane_id] : 0;
     constexpr int stride = kUnroll * int8_veclen;
 
 #pragma unroll
@@ -528,7 +528,7 @@ struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
       for (int j = 0; j < kUnroll; ++j) {
         int32_t encV[veclen_int];
         ldg(encV,
-            reinterpret_cast<int32_t const*>(data) + (laneId + j * kIndexGroupSize) * veclen_int);
+            reinterpret_cast<int32_t const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
         const int d = (i * kUnroll + j) * veclen_int;
 #pragma unroll
         for (int k = 0; k < veclen_int; ++k) {
@@ -540,14 +540,14 @@ struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
   }
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
   {
     constexpr int veclen_int = int8_veclen / 4;
-    const int loadDim        = dimBlocks + laneId * 4;  // Here 4 is for 1 - int;
+    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int;
     int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) {
       int32_t enc[veclen_int];
-      ldg(enc, reinterpret_cast<int32_t const*>(data) + laneId * veclen_int);
+      ldg(enc, reinterpret_cast<int32_t const*>(data) + lane_id * veclen_int);
 #pragma unroll
       for (int k = 0; k < veclen_int; k++) {
         int32_t q = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
@@ -581,10 +581,10 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
                                                         const int8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     int32_t queryReg =
-      (laneId < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[laneId] : 0;
+      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
     constexpr int veclen = 2;
     constexpr int stride = kUnroll * veclen;
 
@@ -592,7 +592,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
     for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV = reinterpret_cast<uint16_t const*>(data)[laneId + j * kIndexGroupSize];
+        int32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
         int32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
         compute_dist(dist, q, encV);
       }
@@ -600,13 +600,13 @@ struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
   }
 
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
   {
     constexpr int veclen = 2;
-    int loadDim          = dimBlocks + laneId * veclen;
+    int loadDim          = dimBlocks + lane_id * veclen;
     int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      int32_t enc = reinterpret_cast<uint16_t const*>(data + laneId * veclen)[0];
+      int32_t enc = reinterpret_cast<uint16_t const*>(data + lane_id * veclen)[0];
       int32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
     }
@@ -636,29 +636,29 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
   __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
                                                         const int8_t* query,
                                                         int baseLoadIndex,
-                                                        const int laneId)
+                                                        const int lane_id)
   {
     constexpr int veclen = 1;
     constexpr int stride = kUnroll * veclen;
-    int32_t queryReg     = query[baseLoadIndex + laneId];
+    int32_t queryReg     = query[baseLoadIndex + lane_id];
 
 #pragma unroll
     for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
 #pragma unroll
       for (int j = 0; j < kUnroll; ++j) {
         compute_dist(
-          dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[laneId + j * kIndexGroupSize]);
+          dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[lane_id + j * kIndexGroupSize]);
       }
     }
   }
   __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int laneId, const int dim, const int dimBlocks)
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
   {
     constexpr int veclen = 1;
-    const int loadDim    = dimBlocks + laneId;
+    const int loadDim    = dimBlocks + lane_id;
     int32_t queryReg     = loadDim < dim ? query[loadDim] : 0;
     for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      compute_dist(dist, shfl(queryReg, d, WarpSize), data[laneId]);
+      compute_dist(dist, shfl(queryReg, d, WarpSize), data[lane_id]);
     }
   }
 };
@@ -678,18 +678,13 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
                           const T* list_data,
                           const uint32_t* list_lengths,
                           const uint32_t* list_prefix_interleave,
-                          const uint32_t nprobe,
+                          const uint32_t n_probes,
                           const uint32_t k,
                           const uint32_t dim,
                           size_t* neighbors,
                           float* distances)
 {
   extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
-  // Using shared memory for the (part of the) query;
-  // This allows to save on global memory bandwidth when reading index and query
-  // data at the same time.
-  // Its size is `query_smem_elems`.
-  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
   // for comparison purpose
@@ -707,97 +702,105 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
-  topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, size_t> queue(
+  topk::block_sort<topk::warp_sort_immediate, Capacity, Ascending, float, size_t> queue(
     k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
 #endif
 
-  using align_warp = Pow2<WarpSize>;
-  const int laneId = align_warp::mod(threadIdx.x);
-  const int warpId = align_warp::div(threadIdx.x);
-  int queryId      = blockIdx.y;
+  const int query_id = blockIdx.y;
+  {
+    // Using shared memory for the (part of the) query;
+    // This allows to save on global memory bandwidth when reading index and query
+    // data at the same time.
+    // Its size is `query_smem_elems`.
+    T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
 
-  /// Set the address
-  auto query               = queries + queryId * dim;
-  constexpr int kGroupSize = WarpSize;
+    using align_warp  = Pow2<WarpSize>;
+    const int lane_id = align_warp::mod(threadIdx.x);
+    const int warp_id = align_warp::div(threadIdx.x);
 
-  // How many full warps needed to compute the distance (without remainder)
-  const int full_warps_along_dim = align_warp::roundDown(dim);
+    /// Set the address
+    auto query               = queries + query_id * dim;
+    constexpr int kGroupSize = WarpSize;
 
-  int shLoadDim = (dim < query_smem_elems) ? dim : query_smem_elems;
+    // How many full warps needed to compute the distance (without remainder)
+    const int full_warps_along_dim = align_warp::roundDown(dim);
 
-  // load the query data from global to shared memory
-  for (int loadDim = threadIdx.x; loadDim * Veclen < shLoadDim; loadDim += blockDim.x) {
-    queryLoadToShmem<T, Veclen>(query, query_shared, loadDim);
-  }
-  __syncthreads();
-  shLoadDim = (dim > query_smem_elems) ? shLoadDim : full_warps_along_dim;
-
-  // Every CUDA block scans one cluster at a time.
-  for (int probeId = blockIdx.x; probeId < nprobe; probeId += gridDim.x) {
-    uint32_t listId = coarse_index[queryId * nprobe + probeId];  // The id of cluster(list)
-
-    /**
-     * Uses shared memory
-     */
-    //@TODO The result with dimension
-    // The start address of the full value of vector for each cluster(list) interleaved
-    auto vecsBase = list_data + size_t(list_prefix_interleave[listId]) * dim;
-    // The start address of index of vector for each cluster(list) interleaved
-    auto indexBase = list_index + list_prefix_interleave[listId];
-    // The number of vectors in each cluster(list); [nlist]
-    const uint32_t list_length = list_lengths[listId];
-
-    // The number of interleaved groups to be processed
-    const uint32_t num_groups = ceildiv<uint32_t>(list_length, WarpSize);
-
-    constexpr int kUnroll        = WarpSize / Veclen;
-    constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
-    // Every warp reads WarpSize vectors and computes the distances to them.
-    // Then, the distances and corresponding ids are distributed among the threads,
-    // and each thread adds one (id, dist) pair to the filtering queue.
-    for (uint32_t block = warpId; block < num_groups; block += kNumWarps) {
-      AccT dist = 0;
-      // This is the vector a given lane/thread handles
-      const uint32_t vec = block * WarpSize + laneId;
-      bool valid         = vec < list_length;
-      size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)laneId;
-      // This is where this warp begins reading data
-      const T* data = vecsBase + size_t(block) * kGroupSize * dim;  // Start position of this block
-
-      if (valid) {
-        /// load query from shared mem
-        for (int dBase = 0; dBase < shLoadDim; dBase += WarpSize) {
-          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> obj(dist,
-                                                                                   compute_dist);
-          obj.runLoadShmemCompute(data, query_shared, laneId, dBase);
-          data += WarpSize * kGroupSize;
-        }
-      }
+    int shm_assisted_dim = (dim < query_smem_elems) ? dim : query_smem_elems;
 
-      if (dim > query_smem_elems) {
-        loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> obj(dist,
-                                                                                 compute_dist);
-        for (int dBase = shLoadDim; dBase < full_warps_along_dim; dBase += WarpSize) {  //
-          obj.runLoadShflAndCompute(data, query, dBase, laneId);
-        }
-        // Remainder chunk = dim - full_warps_along_dim
-        obj.runLoadShflAndComputeRemainder(data, query, laneId, dim, full_warps_along_dim);
-        // end for d < dim - full_warps_along_dim
-      } else {
+    // load the query data from global to shared memory
+    for (int i = threadIdx.x; i * Veclen < shm_assisted_dim; i += blockDim.x) {
+      queryLoadToShmem<T, Veclen>(query, query_shared, i);
+    }
+    __syncthreads();
+    shm_assisted_dim = (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
+
+    // Every CUDA block scans one cluster at a time.
+    for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
+      const uint32_t list_id =
+        coarse_index[query_id * n_probes + probe_id];  // The id of cluster(list)
+
+      /**
+       * Uses shared memory
+       */
+      // The start address of the full value of vector for each cluster(list) interleaved
+      auto vecsBase = list_data + size_t(list_prefix_interleave[list_id]) * dim;
+      // The start address of index of vector for each cluster(list) interleaved
+      auto indexBase = list_index + list_prefix_interleave[list_id];
+      // The number of vectors in each cluster(list); [nlist]
+      const uint32_t list_length = list_lengths[list_id];
+
+      // The number of interleaved groups to be processed
+      const uint32_t num_groups = ceildiv<uint32_t>(list_length, WarpSize);
+
+      constexpr int kUnroll        = WarpSize / Veclen;
+      constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
+      // Every warp reads WarpSize vectors and computes the distances to them.
+      // Then, the distances and corresponding ids are distributed among the threads,
+      // and each thread adds one (id, dist) pair to the filtering queue.
+      for (uint32_t block = warp_id; block < num_groups; block += kNumWarps) {
+        AccT dist = 0;
+        // This is the vector a given lane/thread handles
+        const uint32_t vec = block * WarpSize + lane_id;
+        bool valid         = vec < list_length;
+        size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)lane_id;
+        // This is where this warp begins reading data
+        const T* data =
+          vecsBase + size_t(block) * kGroupSize * dim;  // Start position of this block
+
+        // Process first shm_assisted_dim dimensions (always using shared memory)
         if (valid) {
-          /// Remainder chunk = dim - full_warps_along_dim
-          for (int d = 0; d < dim - full_warps_along_dim;
-               d += Veclen, data += kGroupSize * Veclen) {
-            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> obj(dist, compute_dist);
-            obj.runLoadShmemCompute(data, query_shared, laneId, full_warps_along_dim + d);
-          }  // end for d < dim - full_warps_along_dim
+          for (int pos = 0; pos < shm_assisted_dim; pos += WarpSize) {
+            loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
+                                                                                    compute_dist);
+            lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
+            data += WarpSize * kGroupSize;
+          }
         }
-      }
 
-      // Enqueue one element per thread
-      constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
-      float val              = (valid) ? (float)dist : kDummy;
-      queue.add(val, idx);
+        if (dim > query_smem_elems) {
+          // The default path - using shfl ops - for dimensions beyond query_smem_elems
+          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
+                                                                                  compute_dist);
+          for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) {  //
+            lc.runLoadShflAndCompute(data, query, pos, lane_id);
+          }
+          lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim);
+        } else {
+          // when  shm_assisted_dim == full_warps_along_dim < dim
+          if (valid) {
+            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> lc(dist, compute_dist);
+            for (int pos = full_warps_along_dim; pos < dim;
+                 pos += Veclen, data += kGroupSize * Veclen) {
+              lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
+            }
+          }
+        }
+
+        // Enqueue one element per thread
+        constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
+        float val              = valid ? static_cast<float>(dist) : kDummy;
+        queue.add(val, idx);
+      }
     }
   }
 
@@ -805,13 +808,13 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
 #ifdef USE_FAISS
   queue.reduce();
   for (int i = threadIdx.x; i < k; i += kThreadsPerBlock) {
-    neighbors[queryId * k * gridDim.x + blockIdx.x * k + i] = (size_t)smemV[i];
-    distances[queryId * k * gridDim.x + blockIdx.x * k + i] = smemK[i];
+    neighbors[query_id * k * gridDim.x + blockIdx.x * k + i] = (size_t)smemV[i];
+    distances[query_id * k * gridDim.x + blockIdx.x * k + i] = smemK[i];
   }
 #else
   queue.done();
-  queue.store(distances + queryId * k * gridDim.x + blockIdx.x * k,
-              neighbors + queryId * k * gridDim.x + blockIdx.x * k);
+  queue.store(distances + query_id * k * gridDim.x + blockIdx.x * k,
+              neighbors + query_id * k * gridDim.x + blockIdx.x * k);
 #endif
 }  // end kernel
 
@@ -819,7 +822,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
  *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size
  */
 template <typename T>
-uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSize, T func)
+uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func)
 {
   int dev_id;
   RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
@@ -831,7 +834,7 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t nprobe, int32_t sMemSi
 
   size_t min_grid_size = num_sms * num_blocks_per_sm;
   size_t min_grid_x    = ceildiv<size_t>(min_grid_size, numQueries);
-  return min_grid_x > nprobe ? nprobe : static_cast<uint32_t>(min_grid_x);
+  return min_grid_x > n_probes ? n_probes : static_cast<uint32_t>(min_grid_x);
 }
 
 template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
@@ -840,7 +843,7 @@ void launch_kernel(Lambda lambda,
                    const T* queries,
                    const uint32_t* coarse_index,
                    const uint32_t num_queries,
-                   const uint32_t nprobe,
+                   const uint32_t n_probes,
                    const uint32_t k,
                    size_t* neighbors,
                    float* distances,
@@ -866,7 +869,7 @@ void launch_kernel(Lambda lambda,
   constexpr uint32_t kMaxGridY = 32768;
 
   if (grid_dim_x == 0) {
-    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), nprobe, smem_size, kKernel);
+    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel);
     return;
   }
 
@@ -875,12 +878,12 @@ void launch_kernel(Lambda lambda,
     dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
     dim3 block_dim(kThreadsPerBlock);
     RAFT_LOG_TRACE(
-      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), nprobe = %d, "
+      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, "
       "smem_size = %d",
       grid_dim.x,
       grid_dim.y,
       block_dim.x,
-      nprobe,
+      n_probes,
       smem_size);
     kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
                                                         query_smem_elems,
@@ -890,7 +893,7 @@ void launch_kernel(Lambda lambda,
                                                         index.data.data(),
                                                         index.list_sizes.data(),
                                                         index.list_offsets.data(),
-                                                        nprobe,
+                                                        n_probes,
                                                         k,
                                                         index.dim(),
                                                         neighbors,
@@ -903,22 +906,22 @@ void launch_kernel(Lambda lambda,
 
 template <int Veclen, typename T, typename AccT>
 struct euclidean_dist {
-  __device__ inline void operator()(AccT& acc, AccT x, AccT y)
+  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
   {
-    const AccT diff = x - y;
+    const auto diff = x - y;
     acc += diff * diff;
   }
 };
 
 template <int Veclen>
 struct euclidean_dist<Veclen, uint8_t, uint32_t> {
-  __device__ inline void operator()(uint32_t& acc, uint32_t x, uint32_t y)
+  __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y)
   {
     if constexpr (Veclen > 1) {
-      const uint32_t diff = __vabsdiffu4(x, y);
-      acc                 = dp4a(diff, diff, acc);
+      const auto diff = __vabsdiffu4(x, y);
+      acc             = dp4a(diff, diff, acc);
     } else {
-      const uint32_t diff = x - y;
+      const auto diff = x - y;
       acc += diff * diff;
     }
   }
@@ -926,13 +929,13 @@ struct euclidean_dist<Veclen, uint8_t, uint32_t> {
 
 template <int Veclen>
 struct euclidean_dist<Veclen, int8_t, int32_t> {
-  __device__ inline void operator()(int32_t& acc, int32_t x, int32_t y)
+  __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
   {
     if constexpr (Veclen > 1) {
-      const int32_t diff = static_cast<int32_t>(__vabsdiffs4(x, y));
-      acc                = dp4a(diff, diff, acc);
+      const auto diff = static_cast<int32_t>(__vabsdiffs4(x, y));
+      acc             = dp4a(diff, diff, acc);
     } else {
-      const int32_t diff = x - y;
+      const auto diff = x - y;
       acc += diff * diff;
     }
   }
@@ -940,7 +943,7 @@ struct euclidean_dist<Veclen, int8_t, int32_t> {
 
 template <int Veclen, typename T, typename AccT>
 struct inner_prod_dist {
-  __device__ inline void operator()(AccT& acc, AccT x, AccT y)
+  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
   {
     if constexpr (Veclen > 1 && (std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>)) {
       acc = dp4a(x, y, acc);
@@ -1255,7 +1258,7 @@ inline void search(const handle_t& handle,
                    rmm::mr::device_memory_resource* mr = nullptr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "ivf_flat_handle::search(%u, %u, %zu)", n_queries, k, neighbors);
+    "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
 
   RAFT_EXPECTS(params.n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");

From 805e78c70ffa2b079c1fe7722a2eaa4ffcc8bd2c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:02:48 +0200
Subject: [PATCH 090/118] Add a helper for creating pool_memory_resource when
 it makes sense

---
 cpp/include/raft/core/cudart_utils.hpp        | 49 +++++++++++++++++++
 .../knn/detail/ann_kmeans_balanced.cuh        | 40 +++++++--------
 .../spatial/knn/detail/ivf_flat_search.cuh    | 17 ++-----
 .../spatial/knn/detail/topk/radix_topk.cuh    | 28 +++++------
 .../spatial/knn/detail/topk/warpsort_topk.cuh | 15 +++---
 5 files changed, 91 insertions(+), 58 deletions(-)

diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp
index 7fe6ecaf6b..e0957ea1f3 100644
--- a/cpp/include/raft/core/cudart_utils.hpp
+++ b/cpp/include/raft/core/cudart_utils.hpp
@@ -26,7 +26,9 @@
 
 #include <raft/error.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <cuda_runtime.h>
 
@@ -445,6 +447,53 @@ constexpr T upper_bound()
   return std::numeric_limits<T>::max();
 }
 
+/**
+ * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
+ * unique pointer.
+ *
+ * This function is useful in the code where multiple repeated allocations/deallocations are
+ * expected.
+ * Use case example:
+ * @code{.cpp}
+ *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
+ *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
+ *     if (pool_guard){
+ *       RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size());
+ *     } else {
+ *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
+ *     }
+ *     rmm::device_uvector<float> x(n, stream, mr);
+ *     rmm::device_uvector<float> y(n, stream, mr);
+ *     ...
+ *   }
+ * @endcode
+ * Here, the new memory resource would be created within the function scope if the passed `mr` is
+ * null and the default resource is not a pool. After the call, `mr` contains a valid memory
+ * resource in any case.
+ *
+ * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
+ * into a `pool_memory_resource` if neccessary and return the pointer to the result.
+ * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
+ * to 256 bytes).
+ *
+ * @return if a new memory pool is created, it returns a unique_ptr to it;
+ *   this managed pointer controls the lifetime of the created memory resource.
+ */
+inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_t initial_size)
+{
+  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  std::unique_ptr<pool_res_t> pool_res{};
+  if (mr) return pool_res;
+  mr = rmm::mr::get_current_device_resource();
+  if (!dynamic_cast<pool_res_t*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
+    pool_res = std::make_unique<pool_res_t>(mr, (initial_size + 255) & (~255));
+    mr       = pool_res.get();
+  }
+  return pool_res;
+}
+
 }  // namespace raft
 
 #endif
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 066382db5a..c327ec6ed0 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -20,9 +20,9 @@
 #include "ann_utils.cuh"
 
 #include <raft/common/nvtx.hpp>
+#include <raft/core/cudart_utils.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
 #include <raft/distance/distance_type.hpp>
 #include <raft/linalg/gemm.cuh>
@@ -32,11 +32,8 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_vector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <optional>
 
 namespace raft::spatial::knn::detail::kmeans {
 
@@ -210,7 +207,7 @@ void predict(const handle_t& handle,
              uint32_t* cluster_sizes,
              bool shall_update_centers,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr = nullptr)
+             rmm::mr::device_memory_resource* mr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "kmeans::predict(%u, %u)", n_rows, n_clusters);
@@ -241,13 +238,6 @@ void predict(const handle_t& handle,
     utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
   }
 
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
-  if (mr == nullptr) {
-    pool_res.emplace(rmm::mr::get_current_device_resource(),
-                     Pow2<256>::roundUp(max_minibatch_size * dim * 4));
-    mr = &(pool_res.value());
-  }
-
   rmm::device_uvector<float> cur_dataset(max_minibatch_size * dim, stream, mr);
   for (uint32_t offset = 0; offset < n_rows; offset += max_minibatch_size) {
     auto minibatch_size = std::min<uint32_t>(max_minibatch_size, n_rows - offset);
@@ -619,10 +609,14 @@ void build_optimized_kmeans(const handle_t& handle,
   RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
 
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> device_memory(
-    rmm::mr::get_current_device_resource(),
-    // an arbitrary guess on the upper bound of the workspace size
-    Pow2<256>::roundUp(kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4));
+  rmm::mr::device_memory_resource* device_memory = nullptr;
+  auto pool_guard                                = raft::get_pool_memory_resource(
+    device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4);
+  if (pool_guard) {
+    RAFT_LOG_DEBUG(
+      "kmeans::build_optimized_kmeans: using pool memory resource with initial size %zu bytes",
+      pool_guard->pool_size());
+  }
 
   rmm::device_uvector<T> trainset(n_rows_train * dim, stream, &managed_memory);
   // TODO: a proper sampling
@@ -651,7 +645,7 @@ void build_optimized_kmeans(const handle_t& handle,
                    mesocluster_labels_buf.data(),
                    mesocluster_sizes_buf.data(),
                    metric,
-                   &device_memory,
+                   device_memory,
                    stream);
   }
 
@@ -679,11 +673,11 @@ void build_optimized_kmeans(const handle_t& handle,
                                              cluster_centers,
                                              metric,
                                              &managed_memory,
-                                             &device_memory,
+                                             device_memory,
                                              stream);
   RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
 
-  rmm::device_uvector<float> centers_temp(n_clusters * dim, stream, &device_memory);
+  rmm::device_uvector<float> centers_temp(n_clusters * dim, stream, device_memory);
 
   // fit clusters using the trainset
   for (int iter = 0; iter < 2; iter++) {
@@ -701,7 +695,7 @@ void build_optimized_kmeans(const handle_t& handle,
                     cluster_sizes,
                     true,
                     stream,
-                    &device_memory);
+                    device_memory);
   }
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
@@ -720,7 +714,7 @@ void build_optimized_kmeans(const handle_t& handle,
                   cluster_sizes,
                   true,
                   stream,
-                  &device_memory);
+                  device_memory);
 
   kmeans::predict(handle,
                   cluster_centers,
@@ -735,7 +729,7 @@ void build_optimized_kmeans(const handle_t& handle,
                   cluster_sizes,
                   false,
                   stream,
-                  &device_memory);
+                  device_memory);
 }
 
 }  // namespace raft::spatial::knn::detail::kmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index c3cbd6f3ea..17b0acde99 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -24,11 +24,11 @@
 #include "topk/warpsort_topk.cuh"
 
 #include <raft/common/device_loads_stores.cuh>
+#include <raft/core/cudart_utils.hpp>
 #include <raft/core/handle.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
 #include <raft/pow2_utils.cuh>
@@ -1277,17 +1277,10 @@ inline void search(const handle_t& handle,
     default: select_min = true;
   }
 
-  //   // Set memory buffer to be reused across searches
-  //   auto cur_memory_resource = rmm::mr::get_current_device_resource();
-  //   if (!search_mem_res_.has_value() || search_mem_res_->get_upstream() != cur_memory_resource) {
-  //     search_mem_res_.emplace(cur_memory_resource, Pow2<256>::roundUp(n_queries * n_probes * k *
-  //     16));
-  //   }
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
-  if (mr == nullptr) {
-    pool_res.emplace(rmm::mr::get_current_device_resource(),
-                     Pow2<256>::roundUp(n_queries * n_probes * k * 16));
-    mr = &(pool_res.value());
+  auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16);
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("ivf_flat::search: using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
   }
 
   return search_impl<T, float>(
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
index a012bd7f7d..53d88ff366 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
@@ -16,7 +16,8 @@
 
 #pragma once
 
-#include <raft/cudart_utils.h>
+#include <raft/core/cudart_utils.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/device_atomics.cuh>
 #include <raft/vectorized.cuh>
 
@@ -26,10 +27,7 @@
 #include <cub/block/radix_rank_sort_operations.cuh>
 
 #include <rmm/device_vector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <optional>
+#include <rmm/mr/device/device_memory_resource.hpp>
 
 namespace raft::spatial::knn::detail::topk {
 
@@ -574,17 +572,17 @@ void radix_topk(const T* in,
   dim3 blocks           = get_optimal_grid_size<T, IdxT, BitsPerPass, BlockSize>(batch_size, len);
   size_t max_chunk_size = blocks.y;
 
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
-  if (mr == nullptr) {
-    pool_res.emplace(
-      rmm::mr::get_current_device_resource(),
-      Pow2<256>::roundUp(max_chunk_size *
-                         (sizeof(Counter<T, IdxT>)            // counters
-                          + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
-                          + sizeof(T) * 2                     // T bufs
-                          )));
-    mr = &(pool_res.value());
+  auto pool_guard = raft::get_pool_memory_resource(
+    mr,
+    max_chunk_size * (sizeof(Counter<T, IdxT>)            // counters
+                      + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
+                      + sizeof(T) * 2                     // T bufs
+                      ));
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("radix_topk: using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
   }
+
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
   rmm::device_uvector<IdxT> histograms(num_buckets * max_chunk_size, stream, mr);
   rmm::device_uvector<T> buf1(len * max_chunk_size, stream, mr);
diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
index cd5d2fc728..017678afbb 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
+++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
@@ -18,6 +18,7 @@
 
 #include "bitonic_sort.cuh"
 
+#include <raft/core/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/pow2_utils.cuh>
 
@@ -26,8 +27,7 @@
 #include <type_traits>
 
 #include <rmm/device_vector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 
 /*
   Three APIs of different scopes are provided:
@@ -726,12 +726,11 @@ void warp_sort_topk_(int num_of_block,
                      rmm::cuda_stream_view stream,
                      rmm::mr::device_memory_resource* mr = nullptr)
 {
-  std::optional<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>> pool_res;
-  if (mr == nullptr) {
-    pool_res.emplace(
-      rmm::mr::get_current_device_resource(),
-      Pow2<256>::roundUp(num_of_block * k * batch_size * 2 * std::max(sizeof(T), sizeof(IdxT))));
-    mr = &(pool_res.value());
+  auto pool_guard = raft::get_pool_memory_resource(
+    mr, num_of_block * k * batch_size * 2 * std::max(sizeof(T), sizeof(IdxT)));
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("warp_sort_topk: using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
   }
 
   rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);

From a4973e6ff3d8268b8cda7141b472119ee658f086 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:03:40 +0200
Subject: [PATCH 091/118] Force move the mdarrays when creating index to avoid
 copying them

---
 cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 0f2f70a315..a1d3881405 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -203,8 +203,14 @@ inline auto build(const handle_t& handle,
                           : std::nullopt;
 
   // assemble the index
-  index<T> index{
-    veclen, params.metric, data, indices, list_sizes, list_offsets, centers, center_norms};
+  index<T> index{veclen,
+                 params.metric,
+                 std::move(data),
+                 std::move(indices),
+                 std::move(list_sizes),
+                 std::move(list_offsets),
+                 std::move(centers),
+                 std::move(center_norms)};
 
   // check index invariants
   index.check_consistency();

From 68c267edfddee8a9b97def18faf169e4cfcf8478 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:05:08 +0200
Subject: [PATCH 092/118] Minor refactorings

---
 .../raft/spatial/knn/detail/ann_quantized.cuh    | 16 ++++++++--------
 .../raft/spatial/knn/detail/ann_utils.cuh        | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 420299c8d1..64e0de95a1 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -127,10 +127,10 @@ void approx_knn_build_index(const handle_t& handle,
   auto ivf_pq_pams = dynamic_cast<const ivf_pq_index_params*>(&params);
   auto ivf_sq_pams = dynamic_cast<const ivf_sq_index_params*>(&params);
 
-  if constexpr (std::is_same<T, float>{}) {
+  if constexpr (std::is_same_v<T, float>) {
     index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
   }
-  if constexpr (std::is_same<T, float>{}) { index->metric_processor->preprocess(index_array); }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(index_array); }
 
   if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2SqrtExpanded ||
                       metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
@@ -153,7 +153,7 @@ void approx_knn_build_index(const handle_t& handle,
     } else {
       RAFT_FAIL("Unrecognized index type.");
     }
-    if constexpr (std::is_same<T, float>{}) {
+    if constexpr (std::is_same_v<T, float>) {
       index->index->train(n, index_array);
       index->index->add(n, index_array);
     } else {
@@ -161,7 +161,7 @@ void approx_knn_build_index(const handle_t& handle,
     }
   }
 
-  if constexpr (std::is_same<T, float>{}) { index->metric_processor->revert(index_array); }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(index_array); }
 }
 
 template <typename T = float, typename IntType = int>
@@ -179,11 +179,11 @@ void approx_knn_search(const handle_t& handle,
   auto faiss_ivf   = dynamic_cast<GpuIndexIVF*>(index->index.get());
   if (ivf_pams && faiss_ivf) { faiss_ivf->setNumProbes(ivf_pams->n_probes); }
 
-  if constexpr (std::is_same<T, float>{}) { index->metric_processor->preprocess(query_array); }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(query_array); }
 
   // search
   if (faiss_ivf) {
-    if constexpr (std::is_same<T, float>{}) {
+    if constexpr (std::is_same_v<T, float>) {
       faiss_ivf->search(n, query_array, k, distances, indices);
     } else {
       RAFT_FAIL("FAISS-based index supports only float data.");
@@ -203,7 +203,7 @@ void approx_knn_search(const handle_t& handle,
   }
 
   // revert changes to the query
-  if constexpr (std::is_same<T, float>{}) { index->metric_processor->revert(query_array); }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(query_array); }
 
   // perform post-processing to show the real distances
   if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
@@ -221,7 +221,7 @@ void approx_knn_search(const handle_t& handle,
       [p] __device__(float input) { return powf(input, p); },
       handle.get_stream());
   }
-  if constexpr (std::is_same<T, float>{}) { index->metric_processor->postprocess(distances); }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->postprocess(distances); }
 }
 
 }  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 93c3c2f616..c3c723d71a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -67,7 +67,7 @@ struct pointer_residency_count<Type, Types...> {
 
 /** Check if all argument pointers reside on the host or on the device. */
 template <typename... Types>
-inline auto check_pointer_residency(const Types*... ptrs) -> pointer_residency
+auto check_pointer_residency(const Types*... ptrs) -> pointer_residency
 {
   auto [on_device, on_host] = pointer_residency_count<Types...>::run(ptrs...);
   int n_args                = sizeof...(Types);
@@ -137,7 +137,7 @@ struct mapping {
  * @param[in] value
  * @param[in] n_bytes
  */
-void memset(void* ptr, int value, size_t n_bytes, rmm::cuda_stream_view stream)
+inline void memset(void* ptr, int value, size_t n_bytes, rmm::cuda_stream_view stream)
 {
   switch (check_pointer_residency(ptr)) {
     case pointer_residency::host_and_device:
@@ -199,7 +199,7 @@ __global__ void argmin_along_rows_kernel(uint32_t n_rows,
  * @param[out] out device pointer to the vector of selected indices [n_rows]
  * @param stream
  */
-void argmin_along_rows(
+inline void argmin_along_rows(
   uint32_t n_rows, uint32_t n_cols, const float* a, uint32_t* out, rmm::cuda_stream_view stream)
 {
   uint32_t block_dim = 1024;
@@ -239,7 +239,7 @@ __global__ void dots_along_rows_kernel(uint32_t n_rows, uint32_t n_cols, const f
  * @param[out] out device pointer to the vector of dot-products [n_rows]
  * @param stream
  */
-void dots_along_rows(
+inline void dots_along_rows(
   uint32_t n_rows, uint32_t n_cols, const float* a, float* out, rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);
@@ -350,7 +350,7 @@ __global__ void normalize_rows_kernel(uint32_t n_rows, uint32_t n_cols, float* a
  * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
  * @param stream
  */
-void normalize_rows(uint32_t n_rows, uint32_t n_cols, float* a, rmm::cuda_stream_view stream)
+inline void normalize_rows(uint32_t n_rows, uint32_t n_cols, float* a, rmm::cuda_stream_view stream)
 {
   dim3 threads(32, 4, 1);  // DO NOT CHANGE
   dim3 blocks(ceildiv(n_rows, threads.y), 1, 1);
@@ -379,7 +379,7 @@ __global__ void divide_along_rows_kernel(uint32_t n_rows,
  * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
  * @param[in] d device pointer to a vector of divisors [n_rows]
  */
-void divide_along_rows(
+inline void divide_along_rows(
   uint32_t n_rows, uint32_t n_cols, float* a, const uint32_t* d, rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);

From f2b8ed8f19925e5a26e6b60b4f373d135dbacbde Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:06:04 +0200
Subject: [PATCH 093/118] Add nvtx annotations to the outermost ANN calls for
 better performance analysis

---
 cpp/include/raft/spatial/knn/ann.cuh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 51e76d44f2..8d580db6de 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -19,6 +19,7 @@
 #include "ann_common.hpp"
 #include "detail/ann_quantized.cuh"
 
+#include <raft/core/nvtx.hpp>
 #include <raft/spatial/knn/faiss_mr.hpp>
 
 namespace raft::spatial::knn {
@@ -42,6 +43,8 @@ inline void approx_knn_build_index(const raft::handle_t& handle,
                                    value_idx n,
                                    value_idx D)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "approx_knn_build_index(n_rows = %u, dim = %u)", n, D);
   detail::approx_knn_build_index(handle, index, params, index_array, n, D);
 }
 
@@ -69,6 +72,8 @@ inline void approx_knn_search(const raft::handle_t& handle,
                               T* query_array,
                               value_idx n_queries)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "approx_knn_search(k = %u, n_queries = %u)", k, n_queries);
   detail::approx_knn_search(handle, distances, indices, index, params, k, query_array, n_queries);
 }
 

From f91c7f7e395ae4b53f62d4f1b3138b7c26cca31f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:32:08 +0200
Subject: [PATCH 094/118] Add a few more test cases and annotations for them

---
 cpp/test/spatial/ann_ivf_flat.cu | 37 +++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 401907af8a..c4e4dca84e 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -151,8 +151,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
       index_params.metric    = ps.metric;
       search_params.n_probes = ps.nprobe;
       raft::spatial::knn::knnIndex index;
-      index.index   = nullptr;
-      index.gpu_res = nullptr;
 
       approx_knn_build_index(
         handle_, &index, index_params, database.data(), ps.num_db_vecs, ps.dim);
@@ -216,21 +214,24 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
 };
 
 const std::vector<AnnIvfFlatInputs> inputs = {
-
+  // test various dims (aligned and not aligned to vector sizes)
   {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
-  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
-  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct},
+
+  // test dims that do not fit into kernel shared memory limits
   {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
-  {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
-  {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
-  {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
+  {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct},
+  {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct},
   {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded},
 
+  // various random combinations
   {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded},
@@ -249,9 +250,25 @@ const std::vector<AnnIvfFlatInputs> inputs = {
 
   {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct},
 
+  // test splitting the big query batches  (> max gridDim.y) into smaller batches
   {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct},
-
-  {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}};
+  {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct},
+
+  // test radix_sort for getting the cluster selection
+  {1000,
+   10000,
+   16,
+   10,
+   raft::spatial::knn::detail::topk::kMaxCapacity * 2,
+   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
+   raft::distance::DistanceType::L2Expanded},
+  {1000,
+   10000,
+   16,
+   10,
+   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
+   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
+   raft::distance::DistanceType::InnerProduct} 1024};
 
 typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
 TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); }

From 84b1c5bdfdd59a698fc08f260e42309f88a2e83a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 29 Jun 2022 17:36:06 +0200
Subject: [PATCH 095/118] Fix a typo

---
 cpp/test/spatial/ann_ivf_flat.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index c4e4dca84e..2f2fa0f34f 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -268,7 +268,7 @@ const std::vector<AnnIvfFlatInputs> inputs = {
    10,
    raft::spatial::knn::detail::topk::kMaxCapacity * 4,
    raft::spatial::knn::detail::topk::kMaxCapacity * 4,
-   raft::distance::DistanceType::InnerProduct} 1024};
+   raft::distance::DistanceType::InnerProduct}};
 
 typedef AnnIVFFlatTest<float, float> AnnIVFFlatTestF;
 TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); }

From afc1f6ad7e94a7eee3e442546216a0db1aba11e7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 30 Jun 2022 07:54:11 +0200
Subject: [PATCH 096/118] Move ensure_integral_extents to the detail folder

---
 cpp/include/raft/core/mdarray.hpp   | 12 ------------
 cpp/include/raft/detail/mdarray.hpp | 11 +++++++++++
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index b5bc86322c..2a48587bfc 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -711,18 +711,6 @@ auto make_device_vector_view(ElementType* ptr, size_t n)
   return device_vector_view<ElementType, LayoutPolicy>{ptr, extents};
 }
 
-namespace detail {
-/**
- * Ensure all types listed in the parameter pack `Extents` are integral types.
- * Usage:
- *   put it as the last nameless template parameter of a function:
- *     `typename = ensure_integral_extents<Extents...>`
- */
-template <typename... Extents>
-using ensure_integral_extents =
-  std::enable_if_t<(true && ... && std::is_integral_v<Extents>), void>;
-}  // namespace detail
-
 /**
  * @brief Create a host mdarray.
  * @tparam ElementType the data type of the matrix elements
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 96069ec5dd..8e70df1614 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -253,4 +253,15 @@ namespace stdex = std::experimental;
 using vector_extent = stdex::extents<dynamic_extent>;
 using matrix_extent = stdex::extents<dynamic_extent, dynamic_extent>;
 using scalar_extent = stdex::extents<1>;
+
+/**
+ * Ensure all types listed in the parameter pack `Extents` are integral types.
+ * Usage:
+ *   put it as the last nameless template parameter of a function:
+ *     `typename = ensure_integral_extents<Extents...>`
+ */
+template <typename... Extents>
+using ensure_integral_extents =
+  std::enable_if_t<(true && ... && std::is_integral_v<Extents>), void>;
+
 }  // namespace raft::detail

From 3a10f86930a2468fb1e15d6275800a8529aed008 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 30 Jun 2022 11:55:16 +0200
Subject: [PATCH 097/118] Lift the requirement to have query pointers aligned
 with Veclen

---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 214 ++++++++----------
 1 file changed, 94 insertions(+), 120 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 17b0acde99..4b0a9b5590 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -32,6 +32,7 @@
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
 #include <raft/pow2_utils.cuh>
+#include <raft/vectorized.cuh>
 
 #ifdef USE_FAISS
 #include <faiss/gpu/utils/Comparators.cuh>
@@ -52,70 +53,49 @@ using raft::spatial::knn::ivf_flat::search_params;
 constexpr int kThreadsPerBlock = 128;
 
 /**
- * @brief Copy Veclen elements of type T from `query` to `query_shared` at position `loadDim *
- * Veclen`.
+ * @brief Copy `n` elements per block from one place to another.
  *
- * @param[in] query a pointer to a device global memory
- * @param[out] query_shared a pointer to a device shared memory
- * @param loadDim position at which to start copying elements.
+ * @param[out] out target pointer (unique per block)
+ * @param[in] in source pointer
+ * @param n number of elements to copy
  */
-template <typename T, int Veclen>
-__device__ __forceinline__ void queryLoadToShmem(const T* const& query,
-                                                 T* query_shared,
-                                                 const int loadDim)
+template <int VecBytes = 16, typename T>
+__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n)
 {
-  T queryReg[Veclen];
-  const int loadIndex = loadDim * Veclen;
-  ldg(queryReg, query + loadIndex);
-  sts(&query_shared[loadIndex], queryReg);
-}
-
-template <>
-__device__ __forceinline__ void queryLoadToShmem<uint8_t, 8>(const uint8_t* const& query,
-                                                             uint8_t* query_shared,
-                                                             const int loadDim)
-{
-  constexpr int veclen = 2;  // 8 uint8_t
-  uint32_t queryReg[veclen];
-  const int loadIndex = loadDim * veclen;
-  ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<uint32_t*>(query_shared) + loadIndex, queryReg);
-}
-
-template <>
-__device__ __forceinline__ void queryLoadToShmem<uint8_t, 16>(const uint8_t* const& query,
-                                                              uint8_t* query_shared,
-                                                              const int loadDim)
-{
-  constexpr int veclen = 4;  // 16 uint8_t
-  uint32_t queryReg[veclen];
-  const int loadIndex = loadDim * veclen;
-  ldg(queryReg, reinterpret_cast<uint32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<uint32_t*>(query_shared) + loadIndex, queryReg);
-}
-
-template <>
-__device__ __forceinline__ void queryLoadToShmem<int8_t, 8>(const int8_t* const& query,
-                                                            int8_t* query_shared,
-                                                            const int loadDim)
-{
-  constexpr int veclen = 2;  // 8 int8_t
-  int32_t queryReg[veclen];
-  const int loadIndex = loadDim * veclen;
-  ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<int32_t*>(query_shared) + loadIndex, queryReg);
-}
-
-template <>
-__device__ __forceinline__ void queryLoadToShmem<int8_t, 16>(const int8_t* const& query,
-                                                             int8_t* query_shared,
-                                                             const int loadDim)
-{
-  constexpr int veclen = 4;  // 16 int8_t
-  int32_t queryReg[veclen];
-  const int loadIndex = loadDim * veclen;
-  ldg(queryReg, reinterpret_cast<int32_t const*>(query) + loadIndex);
-  sts(reinterpret_cast<int32_t*>(query_shared) + loadIndex, queryReg);
+  constexpr int VecElems = VecBytes / sizeof(T);  // NOLINT
+  using align_bytes      = Pow2<(size_t)VecBytes>;
+  if constexpr (VecElems > 1) {
+    using align_elems = Pow2<VecElems>;
+    if (!align_bytes::areSameAlignOffsets(out, in)) {
+      return copy_vectorized<(VecBytes >> 1), T>(out, in, n);
+    }
+    {  // process unaligned head
+      uint32_t head = align_bytes::roundUp(in) - in;
+      if (head > 0) {
+        copy_vectorized<sizeof(T), T>(out, in, head);
+        n -= head;
+        in += head;
+        out += head;
+      }
+    }
+    {  // process main part vectorized
+      using vec_t = typename IOType<T, VecElems>::Type;
+      copy_vectorized<sizeof(vec_t), vec_t>(
+        reinterpret_cast<vec_t*>(out), reinterpret_cast<const vec_t*>(in), align_elems::div(n));
+    }
+    {  // process unaligned tail
+      uint32_t tail = align_elems::mod(n);
+      if (tail > 0) {
+        n -= tail;
+        copy_vectorized<sizeof(T), T>(out + n, in + n, tail);
+      }
+    }
+  }
+  if constexpr (VecElems <= 1) {
+    for (int i = threadIdx.x; i < n; i += blockDim.x) {
+      out[i] = in[i];
+    }
+  }
 }
 
 /**
@@ -213,7 +193,7 @@ struct loadAndComputeDist {
       for (int k = 0; k < Veclen; k++) {
         compute_dist(dist, shfl(queryReg, d + k, WarpSize), enc[k]);
       }
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
@@ -292,7 +272,7 @@ struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
         uint32_t q = shfl(queryReg, (d / 4) + k, WarpSize);
         compute_dist(dist, q, enc[k]);
       }
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
@@ -354,7 +334,7 @@ struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
       uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
       uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
       compute_dist(dist, q, enc);
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
@@ -553,7 +533,7 @@ struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
         int32_t q = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
         compute_dist(dist, q, enc[k]);
       }
-    }  // end for d < dim - dimBlocks
+    }
   }
 };
 
@@ -672,7 +652,7 @@ template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, t
 __global__ void __launch_bounds__(kThreadsPerBlock)
   interleaved_scan_kernel(Lambda compute_dist,
                           const uint32_t query_smem_elems,
-                          const T* queries,
+                          const T* query,
                           const uint32_t* coarse_index,
                           const uint32_t* list_index,
                           const T* list_data,
@@ -685,6 +665,24 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
                           float* distances)
 {
   extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
+  // Using shared memory for the (part of the) query;
+  // This allows to save on global memory bandwidth when reading index and query
+  // data at the same time.
+  // Its size is `query_smem_elems`.
+  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
+  // Make the query input and output point to this block's shared query
+  {
+    const int query_id = blockIdx.y;
+    query += query_id * dim;
+    neighbors += query_id * k * gridDim.x + blockIdx.x * k;
+    distances += query_id * k * gridDim.x + blockIdx.x * k;
+    coarse_index += query_id * n_probes;
+  }
+
+  // Copy a part of the query into shared memory for faster processing
+  copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
+  __syncthreads();
+
 #ifdef USE_FAISS
   // temporary use of FAISS blockSelect for development purpose of k <= 32
   // for comparison purpose
@@ -702,78 +700,54 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
       queue(identity, keyMax, smemK, smemV, k);
 
 #else
-  topk::block_sort<topk::warp_sort_immediate, Capacity, Ascending, float, size_t> queue(
+  topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, size_t> queue(
     k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
 #endif
 
-  const int query_id = blockIdx.y;
   {
-    // Using shared memory for the (part of the) query;
-    // This allows to save on global memory bandwidth when reading index and query
-    // data at the same time.
-    // Its size is `query_smem_elems`.
-    T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
-
     using align_warp  = Pow2<WarpSize>;
     const int lane_id = align_warp::mod(threadIdx.x);
-    const int warp_id = align_warp::div(threadIdx.x);
-
-    /// Set the address
-    auto query               = queries + query_id * dim;
-    constexpr int kGroupSize = WarpSize;
 
     // How many full warps needed to compute the distance (without remainder)
-    const int full_warps_along_dim = align_warp::roundDown(dim);
+    const uint32_t full_warps_along_dim = align_warp::roundDown(dim);
 
-    int shm_assisted_dim = (dim < query_smem_elems) ? dim : query_smem_elems;
-
-    // load the query data from global to shared memory
-    for (int i = threadIdx.x; i * Veclen < shm_assisted_dim; i += blockDim.x) {
-      queryLoadToShmem<T, Veclen>(query, query_shared, i);
-    }
-    __syncthreads();
-    shm_assisted_dim = (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
+    const uint32_t shm_assisted_dim =
+      (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
 
     // Every CUDA block scans one cluster at a time.
     for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
-      const uint32_t list_id =
-        coarse_index[query_id * n_probes + probe_id];  // The id of cluster(list)
-
-      /**
-       * Uses shared memory
-       */
-      // The start address of the full value of vector for each cluster(list) interleaved
-      auto vecsBase = list_data + size_t(list_prefix_interleave[list_id]) * dim;
-      // The start address of index of vector for each cluster(list) interleaved
-      auto indexBase = list_index + list_prefix_interleave[list_id];
+      const uint32_t list_id   = coarse_index[probe_id];  // The id of cluster(list)
+      const size_t list_offset = list_prefix_interleave[list_id];
+
       // The number of vectors in each cluster(list); [nlist]
       const uint32_t list_length = list_lengths[list_id];
 
       // The number of interleaved groups to be processed
-      const uint32_t num_groups = ceildiv<uint32_t>(list_length, WarpSize);
+      const uint32_t num_groups =
+        align_warp::div(list_length + align_warp::Mask);  // ceildiv by power of 2
 
       constexpr int kUnroll        = WarpSize / Veclen;
       constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
       // Every warp reads WarpSize vectors and computes the distances to them.
       // Then, the distances and corresponding ids are distributed among the threads,
       // and each thread adds one (id, dist) pair to the filtering queue.
-      for (uint32_t block = warp_id; block < num_groups; block += kNumWarps) {
+      for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups;
+           group_id += kNumWarps) {
         AccT dist = 0;
+        // This is where this warp begins reading data (start position of an interleaved group)
+        const T* data = list_data + (list_offset + group_id * kIndexGroupSize) * dim;
+
         // This is the vector a given lane/thread handles
-        const uint32_t vec = block * WarpSize + lane_id;
-        bool valid         = vec < list_length;
-        size_t idx         = (valid) ? (size_t)indexBase[vec] : (size_t)lane_id;
-        // This is where this warp begins reading data
-        const T* data =
-          vecsBase + size_t(block) * kGroupSize * dim;  // Start position of this block
+        const uint32_t vec_id = group_id * WarpSize + lane_id;
+        const bool valid      = vec_id < list_length;
 
         // Process first shm_assisted_dim dimensions (always using shared memory)
         if (valid) {
-          for (int pos = 0; pos < shm_assisted_dim; pos += WarpSize) {
-            loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
-                                                                                    compute_dist);
+          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
+                                                                                  compute_dist);
+          for (int pos = 0; pos < shm_assisted_dim;
+               pos += WarpSize, data += kIndexGroupSize * WarpSize) {
             lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
-            data += WarpSize * kGroupSize;
           }
         }
 
@@ -781,7 +755,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
           // The default path - using shfl ops - for dimensions beyond query_smem_elems
           loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
                                                                                   compute_dist);
-          for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) {  //
+          for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) {
             lc.runLoadShflAndCompute(data, query, pos, lane_id);
           }
           lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim);
@@ -790,7 +764,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
           if (valid) {
             loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> lc(dist, compute_dist);
             for (int pos = full_warps_along_dim; pos < dim;
-                 pos += Veclen, data += kGroupSize * Veclen) {
+                 pos += Veclen, data += kIndexGroupSize * Veclen) {
               lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
             }
           }
@@ -798,7 +772,8 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
 
         // Enqueue one element per thread
         constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
-        float val              = valid ? static_cast<float>(dist) : kDummy;
+        const float val        = valid ? static_cast<float>(dist) : kDummy;
+        const size_t idx       = valid ? static_cast<size_t>(list_index[list_offset + vec_id]) : 0;
         queue.add(val, idx);
       }
     }
@@ -808,15 +783,14 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
 #ifdef USE_FAISS
   queue.reduce();
   for (int i = threadIdx.x; i < k; i += kThreadsPerBlock) {
-    neighbors[query_id * k * gridDim.x + blockIdx.x * k + i] = (size_t)smemV[i];
-    distances[query_id * k * gridDim.x + blockIdx.x * k + i] = smemK[i];
+    neighbors[i] = (size_t)smemV[i];
+    distances[i] = smemK[i];
   }
 #else
   queue.done();
-  queue.store(distances + query_id * k * gridDim.x + blockIdx.x * k,
-              neighbors + query_id * k * gridDim.x + blockIdx.x * k);
+  queue.store(distances, neighbors);
 #endif
-}  // end kernel
+}
 
 /**
  *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size
@@ -850,8 +824,6 @@ void launch_kernel(Lambda lambda,
                    uint32_t& grid_dim_x,
                    rmm::cuda_stream_view stream)
 {
-  RAFT_EXPECTS(reinterpret_cast<size_t>(queries) % (Veclen * sizeof(T)) == 0,
-               "Queries data is not aligned to the vector load size (Veclen).");
   RAFT_EXPECTS(Veclen == index.veclen,
                "Configured Veclen does not match the index interleaving pattern.");
   constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, Lambda>;
@@ -1230,6 +1202,8 @@ void search_impl(const handle_t& handle,
                                          stream,
                                          search_mr);
     } else {
+      // NB: this branch can only be triggered once `ivfflat_interleaved_scan` above supports larger
+      // `k` values (kMaxCapacity limit as a dependency of topk::block_sort)
       topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
                                               refined_indices_dev.data(),
                                               n_queries,

From 1afd667b4cdcbe8e919c18f22c6083e842dfeea7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 30 Jun 2022 15:39:44 +0200
Subject: [PATCH 098/118] Use move semantics for the index everywhere, but try
 to keep it const where possible

---
 cpp/include/raft/spatial/knn/ann_common.hpp   | 14 ++++-----
 .../raft/spatial/knn/detail/ann_quantized.cuh |  4 +--
 .../spatial/knn/detail/ivf_flat_search.cuh    | 16 +++++-----
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |  2 +-
 .../raft/spatial/knn/ivf_flat_types.hpp       | 30 +++++++++++++++++--
 5 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.hpp b/cpp/include/raft/spatial/knn/ann_common.hpp
index 7008bddaa1..76c2b078c0 100644
--- a/cpp/include/raft/spatial/knn/ann_common.hpp
+++ b/cpp/include/raft/spatial/knn/ann_common.hpp
@@ -33,31 +33,31 @@ struct knnIndex {
   float metricArg;
   std::unique_ptr<faiss::gpu::GpuIndex> index;
   std::unique_ptr<MetricProcessor<float>> metric_processor;
-  std::unique_ptr<ivf_flat::index<float>> ivf_flat_float_;
-  std::unique_ptr<ivf_flat::index<uint8_t>> ivf_flat_uint8_t_;
-  std::unique_ptr<ivf_flat::index<int8_t>> ivf_flat_int8_t_;
+  std::unique_ptr<const ivf_flat::index<float>> ivf_flat_float_;
+  std::unique_ptr<const ivf_flat::index<uint8_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<const ivf_flat::index<int8_t>> ivf_flat_int8_t_;
 
   std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
   int device;
 
   template <typename T>
-  auto ivf_flat() -> std::unique_ptr<ivf_flat::index<T>>&;
+  auto ivf_flat() -> std::unique_ptr<const ivf_flat::index<T>>&;
 };
 
 template <>
-auto knnIndex::ivf_flat<float>() -> std::unique_ptr<ivf_flat::index<float>>&
+auto knnIndex::ivf_flat<float>() -> std::unique_ptr<const ivf_flat::index<float>>&
 {
   return ivf_flat_float_;
 }
 
 template <>
-auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<ivf_flat::index<uint8_t>>&
+auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<const ivf_flat::index<uint8_t>>&
 {
   return ivf_flat_uint8_t_;
 }
 
 template <>
-auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<ivf_flat::index<int8_t>>&
+auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<const ivf_flat::index<int8_t>>&
 {
   return ivf_flat_int8_t_;
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 64e0de95a1..862c80122e 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -45,7 +45,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-namespace raft ::spatial ::knn::detail {
+namespace raft::spatial::knn::detail {
 
 inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
 {
@@ -137,7 +137,7 @@ void approx_knn_build_index(const handle_t& handle,
                       metric == raft::distance::DistanceType::L2Unexpanded ||
                       metric == raft::distance::DistanceType::L2Expanded ||
                       metric == raft::distance::DistanceType::InnerProduct)) {
-    index->ivf_flat<T>() = std::make_unique<ivf_flat::index<T>>(
+    index->ivf_flat<T>() = std::make_unique<const ivf_flat::index<T>>(
       ivf_flat::build(handle, *ivf_ft_pams, index_array, n, D, stream));
   } else {
     RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 4b0a9b5590..ff032c6e9d 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -931,11 +931,11 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg
 {
   if (metric == raft::distance::DistanceType::L2Expanded ||
       metric == raft::distance::DistanceType::L2Unexpanded) {
-    launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>({},
-                                                                                         args...);
+    launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>(
+      {}, std::forward<Args>(args)...);
   } else {
-    launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>({},
-                                                                                          args...);
+    launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>(
+      {}, std::forward<Args>(args)...);
   }
 }
 
@@ -960,13 +960,13 @@ struct select_interleaved_scan_kernel {
     if constexpr (Capacity > 1) {
       if (capacity * 2 <= Capacity) {
         return select_interleaved_scan_kernel<T, AccT, Capacity / 2, Veclen>::run(
-          capacity, veclen, select_min, args...);
+          capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
     if constexpr (Veclen > 1) {
       if (veclen * 2 <= Veclen) {
         return select_interleaved_scan_kernel<T, AccT, Capacity, Veclen / 2>::run(
-          capacity, veclen, select_min, args...);
+          capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
     RAFT_EXPECTS(capacity == Capacity,
@@ -977,9 +977,9 @@ struct select_interleaved_scan_kernel {
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
     if (select_min) {
-      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT>(args...);
+      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT>(std::forward<Args>(args)...);
     } else {
-      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT>(args...);
+      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT>(std::forward<Args>(args)...);
     }
   }
 };
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 18085666f0..3f50ff2e6f 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -47,7 +47,7 @@ inline auto build(const handle_t& handle,
                   const T* dataset,
                   uint32_t n_rows,
                   uint32_t dim,
-                  rmm::cuda_stream_view stream) -> index<T>
+                  rmm::cuda_stream_view stream) -> const index<T>
 {
   return raft::spatial::knn::detail::ivf_flat::build(handle, params, dataset, n_rows, dim, stream);
 }
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 828e465316..90a0610f7f 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -28,6 +28,22 @@ namespace raft::spatial::knn::ivf_flat {
 /** Size of the interleaved group (see `index::data` description). */
 constexpr static uint32_t kIndexGroupSize = 32;
 
+/**
+ * @brief IVF-flat index.
+ *
+ * This structure is supposed to be immutable: it's only constructed using `ivf_flat::build`,
+ * and should never be modified.
+ * At the same time, we expose all its members and allow the aggregate construction, so that
+ * third-party users can implement custom serialization/deserialization routines or modify
+ * the index building process.
+ *
+ * It would seem logical to make all the type's members constant. However, we can't do that
+ * because it would imply copying data when the index is moved. The current solution to this
+ * is to make all public factory functions, such as `ivf_flat::build` return `const index`.
+ *
+ * @tparam T data element type
+ *
+ */
 template <typename T>
 struct index {
   using row_major = layout_c_contiguous;
@@ -40,9 +56,9 @@ struct index {
    * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
    * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
    */
-  uint32_t veclen;
+  const uint32_t veclen;
   /** Distance metric used for clustering. */
-  raft::distance::DistanceType metric;
+  const raft::distance::DistanceType metric;
 
   /**
    * Inverted list data [size, dim].
@@ -82,6 +98,13 @@ struct index {
   /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists]  */
   std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
 
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&) = delete;
+  index(index&&)      = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index& = default;
+  ~index()                          = default;
+
   /** Total length of the index. */
   [[nodiscard]] constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
   /** Dimensionality of the data. */
@@ -118,4 +141,7 @@ struct index_params : ivf_index_params {
 struct search_params : ivf_search_params {
 };
 
+static_assert(std::is_standard_layout_v<index<float>>);
+static_assert(std::is_aggregate_v<index<float>>);
+
 }  // namespace raft::spatial::knn::ivf_flat

From 73ce9e108358c142a2175bbc0fc48e1072fe72ee Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 30 Jun 2022 16:46:13 +0200
Subject: [PATCH 099/118] Update documentation

---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 53 +++++++++++++------
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |  5 ++
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index ff032c6e9d..ccd3359364 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -644,9 +644,27 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
 };
 
 /**
- * See `ivfflat_interleaved_scan` for parameter docs.
+ * Scan clusters for nearest neighbors of the query vectors.
+ * See `ivfflat_interleaved_scan` for more information.
  *
- * query_smem_elems must be multiple of WarpSize * Veclen
+ * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp.
+ * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is
+ * calculated, and the top-k nearest neighbors are selected.
+ *
+ * @param compute_dist distance function
+ * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a
+ * block; this number must be a multiple of `WarpSize * Veclen`.
+ * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
+ * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes]
+ * @param[in] list_indices index<T>.indices
+ * @param[in] list_data index<T>.data
+ * @param[in] list_sizes index<T>.list_sizes
+ * @param[in] list_offsets index<T>.list_offsets
+ * @param n_probes
+ * @param k
+ * @param dim
+ * @param[out] neighbors
+ * @param[out] distances
  */
 template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
 __global__ void __launch_bounds__(kThreadsPerBlock)
@@ -654,10 +672,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
                           const uint32_t query_smem_elems,
                           const T* query,
                           const uint32_t* coarse_index,
-                          const uint32_t* list_index,
+                          const uint32_t* list_indices,
                           const T* list_data,
-                          const uint32_t* list_lengths,
-                          const uint32_t* list_prefix_interleave,
+                          const uint32_t* list_sizes,
+                          const uint32_t* list_offsets,
                           const uint32_t n_probes,
                           const uint32_t k,
                           const uint32_t dim,
@@ -717,10 +735,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
     // Every CUDA block scans one cluster at a time.
     for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
       const uint32_t list_id   = coarse_index[probe_id];  // The id of cluster(list)
-      const size_t list_offset = list_prefix_interleave[list_id];
+      const size_t list_offset = list_offsets[list_id];
 
       // The number of vectors in each cluster(list); [nlist]
-      const uint32_t list_length = list_lengths[list_id];
+      const uint32_t list_length = list_sizes[list_id];
 
       // The number of interleaved groups to be processed
       const uint32_t num_groups =
@@ -773,7 +791,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
         // Enqueue one element per thread
         constexpr float kDummy = Ascending ? upper_bound<float>() : lower_bound<float>();
         const float val        = valid ? static_cast<float>(dist) : kDummy;
-        const size_t idx       = valid ? static_cast<size_t>(list_index[list_offset + vec_id]) : 0;
+        const size_t idx = valid ? static_cast<size_t>(list_indices[list_offset + vec_id]) : 0;
         queue.add(val, idx);
       }
     }
@@ -929,13 +947,16 @@ struct inner_prod_dist {
 template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename... Args>
 void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
 {
-  if (metric == raft::distance::DistanceType::L2Expanded ||
-      metric == raft::distance::DistanceType::L2Unexpanded) {
-    launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>(
-      {}, std::forward<Args>(args)...);
-  } else {
-    launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>(
-      {}, std::forward<Args>(args)...);
+  switch (metric) {
+    case raft::distance::DistanceType::L2Expanded:
+    case raft::distance::DistanceType::L2Unexpanded:
+      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>(
+        {}, std::forward<Args>(args)...);
+    case raft::distance::DistanceType::InnerProduct:
+      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>(
+        {}, std::forward<Args>(args)...);
+    // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
+    default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
   }
 }
 
@@ -969,6 +990,8 @@ struct select_interleaved_scan_kernel {
           capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
+    // NB: this is the limitation of the topk::block_topk stuctures that use a huge number of
+    //     registers (used in the main kernel here).
     RAFT_EXPECTS(capacity == Capacity,
                  "Capacity must be power-of-two not bigger than the maximum allowed size "
                  "topk::kMaxCapacity (%d).",
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 3f50ff2e6f..e05ac40fde 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -30,6 +30,11 @@ namespace raft::spatial::knn::ivf_flat {
 /**
  * @brief Build the index from the dataset for efficient search.
  *
+ * NB: Currently, the following distance metrics are supported:
+ *   L2Expanded
+ *   L2Unexpanded
+ *   InnerProduct
+ *
  * @tparam T data element type
  *
  * @param handle

From 2a45645b600fe8894b892a4cbaa90d24fc0acfd7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 30 Jun 2022 16:50:14 +0200
Subject: [PATCH 100/118] Remove the debug path USE_FAISS

---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 39 +------------------
 1 file changed, 2 insertions(+), 37 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index ccd3359364..fbd1af4343 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-// #define USE_FAISS
-
 #include "../ivf_flat_types.hpp"
 #include "ann_utils.cuh"
 #include "topk/radix_topk.cuh"
@@ -34,11 +32,6 @@
 #include <raft/pow2_utils.cuh>
 #include <raft/vectorized.cuh>
 
-#ifdef USE_FAISS
-#include <faiss/gpu/utils/Comparators.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#endif
-
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -701,26 +694,8 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
   __syncthreads();
 
-#ifdef USE_FAISS
-  // temporary use of FAISS blockSelect for development purpose of k <= 32
-  // for comparison purpose
-  __shared__ float smemK[kThreadsPerBlock];
-  __shared__ size_t smemV[kThreadsPerBlock];
-
-  constexpr auto Dir = !Ascending;
-  constexpr auto identity =
-    Dir ? std::numeric_limits<float>::min() : std::numeric_limits<float>::max();
-  constexpr auto keyMax =
-    Dir ? std::numeric_limits<size_t>::min() : std::numeric_limits<size_t>::max();
-
-  faiss::gpu::
-    BlockSelect<float, size_t, Dir, faiss::gpu::Comparator<float>, 32, 2, kThreadsPerBlock>
-      queue(identity, keyMax, smemK, smemV, k);
-
-#else
   topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, size_t> queue(
     k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
-#endif
 
   {
     using align_warp  = Pow2<WarpSize>;
@@ -797,17 +772,9 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
     }
   }
 
-  /// Warp_wise topk
-#ifdef USE_FAISS
-  queue.reduce();
-  for (int i = threadIdx.x; i < k; i += kThreadsPerBlock) {
-    neighbors[i] = (size_t)smemV[i];
-    distances[i] = smemK[i];
-  }
-#else
+  // finalize and store selected neighbours
   queue.done();
   queue.store(distances, neighbors);
-#endif
 }
 
 /**
@@ -848,12 +815,10 @@ void launch_kernel(Lambda lambda,
   const int max_query_smem = 16384;
   int query_smem_elems =
     std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
-  int smem_size = query_smem_elems * sizeof(T);
-#ifndef USE_FAISS
+  int smem_size              = query_smem_elems * sizeof(T);
   constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
   smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
     kThreadsPerBlock / kSubwarpSize, k);
-#endif
 
   // power-of-two less than cuda limit (for better addr alignment)
   constexpr uint32_t kMaxGridY = 32768;

From 75a48b408069da0f8aa3bc4fd80b5fb269bd22f2 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 1 Jul 2022 08:10:11 +0200
Subject: [PATCH 101/118] Add a type trait for checking if the conversion
 between two numeric types is narrowing

---
 cpp/include/raft/integer_utils.h | 22 +++++++++++++++++++++-
 cpp/test/integer_utils.cpp       | 10 +++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index 5fc56de14b..a2ce7598c6 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  *     Copyright 2019 Eyal Rozenberg <eyalroz@blazingdb.com>
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,4 +161,24 @@ std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T
   return value;
 }
 
+/**
+ * @defgroup Check whether the numeric conversion is narrowing
+ *
+ * @tparam From source type
+ * @tparam To destination type
+ * @{
+ */
+template <typename From, typename To, typename = void>
+struct is_narrowing : std::true_type {
+};
+
+template <typename From, typename To>
+struct is_narrowing<From, To, std::void_t<decltype(To{std::declval<From>()})>> : std::false_type {
+};
+/** @} */
+
+/** Check whether the numeric conversion is narrowing */
+template <typename From, typename To>
+inline constexpr bool is_narrowing_v = is_narrowing<From, To>::value;  // NOLINT
+
 }  // namespace raft
diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp
index 71567deb45..117e7f5f7e 100644
--- a/cpp/test/integer_utils.cpp
+++ b/cpp/test/integer_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,14 @@
 
 namespace raft {
 
+static_assert(!is_narrowing_v<uint32_t, uint64_t>);
+static_assert(!is_narrowing_v<uint32_t, int64_t>);
+static_assert(!is_narrowing_v<uint32_t, uint32_t>);
+static_assert(is_narrowing_v<uint32_t, int32_t>);
+static_assert(is_narrowing_v<uint32_t, int>);
+static_assert(!is_narrowing_v<float, double>);
+static_assert(is_narrowing_v<double, float>);
+
 TEST(Raft, rounding_up)
 {
   ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2);

From 388200c91007c5c304b3fdb738a39fcf43c29594 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 1 Jul 2022 11:51:05 +0200
Subject: [PATCH 102/118] Support 32bit and unsigned indices in bruteforce KNN

---
 cpp/bench/CMakeLists.txt                      |   1 +
 cpp/bench/common/benchmark.hpp                |  20 +-
 cpp/bench/spatial/knn.cu                      | 331 ++++++++++++++++++
 .../spatial/knn/detail/haversine_distance.cuh |   6 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |   6 +-
 .../raft/spatial/knn/specializations/knn.cuh  |  32 ++
 cpp/src/nn/specializations/knn.cu             |  30 ++
 cpp/test/spatial/knn.cu                       |  24 +-
 8 files changed, 431 insertions(+), 19 deletions(-)
 create mode 100644 cpp/bench/spatial/knn.cu

diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 7a0f1d5201..51e1c41499 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -30,6 +30,7 @@ add_executable(${RAFT_CPP_BENCH_TARGET}
   bench/random/permute.cu
   bench/random/rng.cu
   bench/spatial/fused_l2_nn.cu
+  bench/spatial/knn.cu
   bench/spatial/selection.cu
   bench/main.cpp
 )
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index de34cf4f57..fb878a0c8d 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -40,7 +40,7 @@ struct using_pool_memory_res {
  private:
   rmm::mr::device_memory_resource* orig_res_;
   rmm::mr::cuda_memory_resource cuda_res_;
-  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_res_;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_res_;
 
  public:
   using_pool_memory_res(size_t initial_size, size_t max_size)
@@ -115,13 +115,20 @@ class fixture {
     int device_id     = 0;
     RAFT_CUDA_TRY(cudaGetDevice(&device_id));
     RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id));
-    scratch_buf_ = rmm::device_buffer(l2_cache_size, stream);
+    scratch_buf_ = rmm::device_buffer(l2_cache_size * 3, stream);
   }
 
   // every benchmark should be overriding this
   virtual void run_benchmark(::benchmark::State& state) = 0;
   virtual void generate_metrics(::benchmark::State& state) {}
 
+ protected:
+  /** The helper that writes zeroes to some buffer in GPU memory to flush the L2 cache.  */
+  void flush_L2_cache()
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
+  }
+
   /**
    * The helper to be used inside `run_benchmark`, to loop over the state and record time using the
    * cuda_event_timer.
@@ -130,9 +137,7 @@ class fixture {
   void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true)
   {
     for (auto _ : state) {
-      if (flush_L2) {
-        RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
-      }
+      if (flush_L2) { flush_L2_cache(); }
       cuda_event_timer timer(state, stream);
       benchmark_func();
     }
@@ -147,9 +152,9 @@ class Fixture : public ::benchmark::Fixture {
 
  public:
   explicit Fixture(const std::string name, const Params&... params)
-    : ::benchmark::Fixture(), params_(params...)
+    : ::benchmark::Fixture(), params_(params...), name_(name)
   {
-    SetName(name.c_str());
+    SetName(name_.c_str());
   }
   Fixture() = delete;
 
@@ -165,6 +170,7 @@ class Fixture : public ::benchmark::Fixture {
  private:
   std::unique_ptr<Class> fixture_;
   std::tuple<Params...> params_;
+  const std::string name_;
 
  protected:
   void BenchmarkCase(State& state) override
diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
new file mode 100644
index 0000000000..7ec147511e
--- /dev/null
+++ b/cpp/bench/spatial/knn.cu
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/random/rng.cuh>
+#include <raft/spatial/knn/knn.cuh>
+#if defined RAFT_NN_COMPILED
+#include <raft/spatial/knn/specializations.cuh>
+#endif
+
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <rmm/mr/host/new_delete_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+#include <optional>
+
+namespace raft::bench::spatial {
+
+struct params {
+  /** Size of the dataset. */
+  size_t n_samples;
+  /** Number of dimensions in the dataset. */
+  size_t n_dims;
+  /** The batch size -- number of KNN searches. */
+  size_t n_queries;
+  /** Number of nearest neighbours to find for every probe. */
+  size_t k;
+};
+
+auto operator<<(std::ostream& os, const params& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k;
+  return os;
+}
+
+enum class TransferStrategy { NO_COPY, COPY_PLAIN, COPY_PINNED, MAP_PINNED, MANAGED };
+enum class Scope { BUILD, SEARCH, BUILD_SEARCH };
+
+auto operator<<(std::ostream& os, const TransferStrategy& ts) -> std::ostream&
+{
+  switch (ts) {
+    case TransferStrategy::NO_COPY: os << "NO_COPY"; break;
+    case TransferStrategy::COPY_PLAIN: os << "COPY_PLAIN"; break;
+    case TransferStrategy::COPY_PINNED: os << "COPY_PINNED"; break;
+    case TransferStrategy::MAP_PINNED: os << "MAP_PINNED"; break;
+    case TransferStrategy::MANAGED: os << "MANAGED"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+auto operator<<(std::ostream& os, const Scope& s) -> std::ostream&
+{
+  switch (s) {
+    case Scope::BUILD: os << "BUILD"; break;
+    case Scope::SEARCH: os << "SEARCH"; break;
+    case Scope::BUILD_SEARCH: os << "BUILD_SEARCH"; break;
+    default: os << "UNKNOWN";
+  }
+  return os;
+}
+
+struct device_resource {
+ public:
+  explicit device_resource(bool managed) : managed_(managed)
+  {
+    if (managed_) {
+      res_ = new rmm::mr::managed_memory_resource();
+    } else {
+      res_ = rmm::mr::get_current_device_resource();
+    }
+  }
+
+  ~device_resource()
+  {
+    if (managed_) { delete res_; }
+  }
+
+  [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; }
+
+ private:
+  const bool managed_;
+  rmm::mr::device_memory_resource* res_;
+};
+
+template <typename T>
+struct host_uvector {
+  host_uvector(size_t n, bool pinned) : n_(n)
+  {
+    if (pinned) {
+      res_ = new rmm::mr::pinned_memory_resource();
+    } else {
+      res_ = new rmm::mr::new_delete_resource();
+    }
+    arr_ = static_cast<T*>(res_->allocate(n_ * sizeof(T)));
+  }
+
+  ~host_uvector() noexcept
+  {
+    res_->deallocate(arr_, n_ * sizeof(T));
+    delete res_;
+  }
+
+  auto data() -> T* { return arr_; }
+  [[nodiscard]] auto size() const -> size_t { return n_; }
+
+ private:
+  rmm::mr::host_memory_resource* res_;
+  size_t n_;
+  T* arr_;
+};
+
+template <typename ValT, typename IdxT>
+struct brute_force_knn {
+  using dist_t = ValT;
+
+  ValT* index;
+  params ps;
+
+  brute_force_knn(const raft::handle_t& handle, const params& ps, const ValT* data)
+    : index(const_cast<ValT*>(data)), ps(ps)
+  {
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    std::vector<ValT*> input{index};
+    std::vector<size_t> sizes{ps.n_samples};
+    raft::spatial::knn::brute_force_knn<IdxT, ValT, size_t>(handle,
+                                                            input,
+                                                            sizes,
+                                                            ps.n_dims,
+                                                            const_cast<ValT*>(search_items),
+                                                            ps.n_queries,
+                                                            out_idxs,
+                                                            out_dists,
+                                                            ps.k);
+  }
+};
+
+template <typename ValT, typename IdxT, typename ImplT>
+struct knn : public fixture {
+  explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope)
+    : params_(p),
+      strategy_(strategy),
+      scope_(scope),
+      dev_mem_res_(strategy == TransferStrategy::MANAGED),
+      data_host_(0),
+      search_items_(p.n_queries * p.n_dims, stream),
+      out_dists_(p.n_queries * p.k, stream),
+      out_idxs_(p.n_queries * p.k, stream)
+  {
+    raft::random::RngState state{42};
+    gen_data(state, search_items_, search_items_.size(), stream);
+    try {
+      size_t total_size = p.n_samples * p.n_dims;
+      data_host_.resize(total_size);
+      constexpr size_t kGenMinibatchSize = 1024 * 1024 * 1024;
+      rmm::device_uvector<ValT> d(std::min(kGenMinibatchSize, total_size), stream);
+      for (size_t offset = 0; offset < total_size; offset += kGenMinibatchSize) {
+        size_t actual_size = std::min(total_size - offset, kGenMinibatchSize);
+        gen_data(state, d, actual_size, stream);
+        copy(data_host_.data() + offset, d.data(), actual_size, stream);
+      }
+    } catch (std::bad_alloc& e) {
+      data_does_not_fit_ = true;
+    }
+  }
+
+  template <typename T>
+  void gen_data(raft::random::RngState& state,
+                rmm::device_uvector<T>& vec,
+                size_t n,
+                rmm::cuda_stream_view stream)
+  {
+    constexpr T kRangeMax = std::is_integral_v<T> ? std::numeric_limits<T>::max() : T(1);
+    constexpr T kRangeMin = std::is_integral_v<T> ? std::numeric_limits<T>::min() : T(-1);
+    if constexpr (std::is_integral_v<T>) {
+      raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    } else {
+      raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream);
+    }
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    if (data_does_not_fit_) {
+      state.SkipWithError("The data size is too big to fit into the host memory.");
+    }
+    if (scope_ == Scope::SEARCH && strategy_ != TransferStrategy::NO_COPY) {
+      state.SkipWithError(
+        "When benchmarking without index building (Scope::SEARCH), the data must be already on the "
+        "device (TransferStrategy::NO_COPY)");
+    }
+
+    using_pool_memory_res default_resource;
+
+    try {
+      std::ostringstream label_stream;
+      label_stream << params_ << "#" << strategy_ << "#" << scope_;
+      state.SetLabel(label_stream.str());
+      raft::handle_t handle(stream);
+      std::optional<ImplT> index;
+
+      if (scope_ == Scope::SEARCH) {  // also implies TransferStrategy::NO_COPY
+        rmm::device_uvector<ValT> data(data_host_.size(), stream);
+        copy(data.data(), data_host_.data(), data_host_.size(), stream);
+        index.emplace(handle, params_, data.data());
+        stream.synchronize();
+      }
+
+      // benchmark loop
+      for (auto _ : state) {
+        // managed or plain device memory initialized anew every time
+        rmm::device_uvector<ValT> data(data_host_.size(), stream, dev_mem_res_.get());
+        ValT* data_ptr         = data.data();
+        size_t allocation_size = data_host_.size() * sizeof(ValT);
+
+        // Non-benchmarked part: using different methods to copy the data if necessary
+        switch (strategy_) {
+          case TransferStrategy::NO_COPY:  // copy data to GPU before starting the timer.
+            copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            break;
+          case TransferStrategy::COPY_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterDefault));
+            break;
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(
+              cudaHostRegister(data_host_.data(), allocation_size, cudaHostRegisterMapped));
+            RAFT_CUDA_TRY(cudaHostGetDevicePointer(&data_ptr, data_host_.data(), 0));
+            break;
+          case TransferStrategy::MANAGED:  // sic! using std::memcpy rather than cuda copy
+            CUDA_CHECK(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetPreferredLocation, handle.get_device()));
+            CUDA_CHECK(cudaMemAdvise(
+              data_ptr, allocation_size, cudaMemAdviseSetAccessedBy, handle.get_device()));
+            CUDA_CHECK(cudaMemAdvise(data_ptr, allocation_size, cudaMemAdviseSetReadMostly, 0));
+            std::memcpy(data_ptr, data_host_.data(), allocation_size);
+            break;
+          default: break;
+        }
+
+        flush_L2_cache();
+        {
+          // Timer synchronizes the stream, so all prior gpu work should be done before it sets off.
+          cuda_event_timer timer(state, stream);
+          switch (strategy_) {
+            case TransferStrategy::COPY_PLAIN:
+            case TransferStrategy::COPY_PINNED:
+              copy(data_ptr, data_host_.data(), data_host_.size(), stream);
+            default: break;
+          }
+
+          if (scope_ != Scope::SEARCH) { index.emplace(handle, params_, data_ptr); }
+          if (scope_ != Scope::BUILD) {
+            index->search(handle, search_items_.data(), out_dists_.data(), out_idxs_.data());
+          }
+        }
+
+        if (scope_ != Scope::SEARCH) { index.reset(); }
+
+        switch (strategy_) {
+          case TransferStrategy::COPY_PINNED:
+          case TransferStrategy::MAP_PINNED:
+            RAFT_CUDA_TRY(cudaHostUnregister(data_host_.data()));
+            break;
+          default: break;
+        }
+      }
+    } catch (raft::exception& e) {
+      state.SkipWithError(e.what());
+    } catch (std::bad_alloc& e) {
+      state.SkipWithError(e.what());
+    }
+  }
+
+ private:
+  const params params_;
+  const TransferStrategy strategy_;
+  const Scope scope_;
+  device_resource dev_mem_res_;
+  bool data_does_not_fit_ = false;
+
+  std::vector<ValT> data_host_;
+  rmm::device_uvector<ValT> search_items_;
+  rmm::device_uvector<typename ImplT::dist_t> out_dists_;
+  rmm::device_uvector<IdxT> out_idxs_;
+};
+
+const std::vector<params> kInputs{
+  {2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}, {10000, 8192, 1000, 32}};
+
+const std::vector<TransferStrategy> kAllStrategies{
+  TransferStrategy::NO_COPY, TransferStrategy::MAP_PINNED, TransferStrategy::MANAGED};
+const std::vector<TransferStrategy> kNoCopyOnly{TransferStrategy::NO_COPY};
+
+const std::vector<Scope> kScopeFull{Scope::BUILD_SEARCH};
+const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD};
+
+#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope)                   \
+  namespace BENCHMARK_PRIVATE_NAME(knn)                                          \
+  {                                                                              \
+    using KNN = knn<ValT, IdxT, ImplT<ValT, IdxT>>;                              \
+    RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" #ImplT, inputs, strats, scope); \
+  }
+
+KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+
+KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull);
+
+}  // namespace raft::bench::spatial
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index c2d89aae7d..5d703bdb8d 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -72,7 +72,11 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
 
   faiss::gpu::
     BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
+      heap(faiss::gpu::Limits<value_t>::getMax(),
+           std::numeric_limits<value_idx>::max(),
+           smemK,
+           smemV,
+           k);
 
   // Grid is exactly sized to rows available
   int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 196124352a..f78ffa84e1 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -268,11 +268,11 @@ void brute_force_knn_impl(
   int device;
   RAFT_CUDA_TRY(cudaGetDevice(&device));
 
-  rmm::device_uvector<std::int64_t> trans(id_ranges->size(), userStream);
+  rmm::device_uvector<IdxType> trans(id_ranges->size(), userStream);
   raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
 
   rmm::device_uvector<value_t> all_D(0, userStream);
-  rmm::device_uvector<std::int64_t> all_I(0, userStream);
+  rmm::device_uvector<IdxType> all_I(0, userStream);
 
   value_t* out_D = res_D;
   IdxType* out_I = res_I;
@@ -342,6 +342,8 @@ void brute_force_knn_impl(
           args.numQueries      = n;
           args.outDistances    = out_d_ptr;
           args.outIndices      = out_i_ptr;
+          args.outIndicesType  = sizeof(IdxType) == 4 ? faiss::gpu::IndicesDataType::I32
+                                                      : faiss::gpu::IndicesDataType::I64;
 
           /**
            * @todo: Until FAISS supports pluggable allocation strategies,
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh
index 6cf2418d29..bbbbf67d71 100644
--- a/cpp/include/raft/spatial/knn/specializations/knn.cuh
+++ b/cpp/include/raft/spatial/knn/specializations/knn.cuh
@@ -50,6 +50,38 @@ extern template void brute_force_knn<long, float, unsigned int>(raft::handle_t c
                                                                 std::vector<long>* translations,
                                                                 distance::DistanceType metric,
                                                                 float metric_arg);
+
+extern template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
+                                                           std::vector<float*>& input,
+                                                           std::vector<int>& sizes,
+                                                           int D,
+                                                           float* search_items,
+                                                           int n,
+                                                           uint32_t* res_I,
+                                                           float* res_D,
+                                                           int k,
+                                                           bool rowMajorIndex,
+                                                           bool rowMajorQuery,
+                                                           std::vector<uint32_t>* translations,
+                                                           distance::DistanceType metric,
+                                                           float metric_arg);
+
+extern template void brute_force_knn<uint32_t, float, unsigned int>(
+  raft::handle_t const& handle,
+  std::vector<float*>& input,
+  std::vector<unsigned int>& sizes,
+  unsigned int D,
+  float* search_items,
+  unsigned int n,
+  uint32_t* res_I,
+  float* res_D,
+  unsigned int k,
+  bool rowMajorIndex,
+  bool rowMajorQuery,
+  std::vector<uint32_t>* translations,
+  distance::DistanceType metric,
+  float metric_arg);
+
 };  // namespace knn
 };  // namespace spatial
 };  // namespace raft
diff --git a/cpp/src/nn/specializations/knn.cu b/cpp/src/nn/specializations/knn.cu
index bb59e5b2ba..4e0a821c24 100644
--- a/cpp/src/nn/specializations/knn.cu
+++ b/cpp/src/nn/specializations/knn.cu
@@ -51,6 +51,36 @@ template void brute_force_knn<long, float, unsigned int>(raft::handle_t const& h
                                                          distance::DistanceType metric,
                                                          float metric_arg);
 
+template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
+                                                    std::vector<float*>& input,
+                                                    std::vector<int>& sizes,
+                                                    int D,
+                                                    float* search_items,
+                                                    int n,
+                                                    uint32_t* res_I,
+                                                    float* res_D,
+                                                    int k,
+                                                    bool rowMajorIndex,
+                                                    bool rowMajorQuery,
+                                                    std::vector<uint32_t>* translations,
+                                                    distance::DistanceType metric,
+                                                    float metric_arg);
+
+template void brute_force_knn<uint32_t, float, unsigned int>(raft::handle_t const& handle,
+                                                             std::vector<float*>& input,
+                                                             std::vector<unsigned int>& sizes,
+                                                             unsigned int D,
+                                                             float* search_items,
+                                                             unsigned int n,
+                                                             uint32_t* res_I,
+                                                             float* res_D,
+                                                             unsigned int k,
+                                                             bool rowMajorIndex,
+                                                             bool rowMajorQuery,
+                                                             std::vector<uint32_t>* translations,
+                                                             distance::DistanceType metric,
+                                                             float metric_arg);
+
 };  // namespace knn
 };  // namespace spatial
 };  // namespace raft
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index bf13288f48..37e0edb6ab 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -16,8 +16,8 @@
 
 #include "../test_utils.h"
 
+#include <raft/core/logger.hpp>
 #include <raft/distance/distance_type.hpp>
-
 #include <raft/spatial/knn/knn.cuh>
 #if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.cuh>
@@ -40,8 +40,9 @@ struct KNNInputs {
   std::vector<int> labels;
 };
 
+template <typename IdxT>
 __global__ void build_actual_output(
-  int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices)
+  int* output, int n_rows, int k, const int* idx_labels, const IdxT* indices)
 {
   int element = threadIdx.x + blockDim.x * blockIdx.x;
   if (element >= n_rows * k) return;
@@ -60,7 +61,7 @@ __global__ void build_expected_output(int* output, int n_rows, int k, const int*
   }
 }
 
-template <typename T>
+template <typename T, typename IdxT>
 class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  public:
   KNNTest()
@@ -79,9 +80,11 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  protected:
   void testBruteForce()
   {
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
     raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout);
-    std::cout << "K: " << k_ << "\n";
+    std::cout << "K: " << k_ << std::endl;
     raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout);
+#endif
 
     std::vector<float*> input_vec;
     std::vector<int> sizes_vec;
@@ -131,7 +134,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     RAFT_CUDA_TRY(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(IdxT), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream));
@@ -165,7 +168,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
   int cols_;
   rmm::device_uvector<float> input_;
   rmm::device_uvector<float> search_data_;
-  rmm::device_uvector<int64_t> indices_;
+  rmm::device_uvector<IdxT> indices_;
   rmm::device_uvector<float> distances_;
   int k_;
 
@@ -191,10 +194,13 @@ const std::vector<KNNInputs> inputs = {
    2,
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}};
 
-typedef KNNTest<float> KNNTestF;
-TEST_P(KNNTestF, BruteForce) { this->testBruteForce(); }
+typedef KNNTest<float, int64_t> KNNTestFint64_t;
+TEST_P(KNNTestFint64_t, BruteForce) { this->testBruteForce(); }
+typedef KNNTest<float, uint32_t> KNNTestFuint32_t;
+TEST_P(KNNTestFuint32_t, BruteForce) { this->testBruteForce(); }
 
-INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestF, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFint64_t, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFuint32_t, ::testing::ValuesIn(inputs));
 
 }  // namespace knn
 }  // namespace spatial

From 14bfe020f1e70d077026dc4599302937c7003f24 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 1 Jul 2022 14:06:02 +0200
Subject: [PATCH 103/118] Make index type a template parameter

---
 cpp/bench/spatial/knn.cu                      |  42 ++++
 cpp/include/raft/spatial/knn/ann_common.hpp   |  18 +-
 .../raft/spatial/knn/detail/ann_quantized.cuh |  26 +--
 .../spatial/knn/detail/ivf_flat_build.cuh     |  66 +++---
 .../spatial/knn/detail/ivf_flat_search.cuh    | 217 ++++++++++--------
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |  14 +-
 .../raft/spatial/knn/ivf_flat_types.hpp       |  32 +--
 7 files changed, 241 insertions(+), 174 deletions(-)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 7ec147511e..c5d208ab9c 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -17,6 +17,7 @@
 #include <common/benchmark.hpp>
 
 #include <raft/random/rng.cuh>
+#include <raft/spatial/knn/ivf_flat.cuh>
 #include <raft/spatial/knn/knn.cuh>
 #if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.cuh>
@@ -126,6 +127,41 @@ struct host_uvector {
   T* arr_;
 };
 
+template <typename ValT, typename IdxT>
+struct ivf_flat_knn {
+  using dist_t = float;
+
+  std::optional<const raft::spatial::knn::ivf_flat::index<ValT, IdxT>> index;
+  raft::spatial::knn::ivf_flat::index_params index_params;
+  raft::spatial::knn::ivf_flat::search_params search_params;
+  params ps;
+
+  ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  {
+    index_params.n_lists = 4096;
+    index_params.metric  = raft::distance::DistanceType::L2Expanded;
+    index.emplace(raft::spatial::knn::ivf_flat::build(
+      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims), handle.get_stream()));
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    search_params.n_probes = 20;
+    raft::spatial::knn::ivf_flat::search(handle,
+                                         search_params,
+                                         *index,
+                                         search_items,
+                                         ps.n_queries,
+                                         ps.k,
+                                         out_idxs,
+                                         out_dists,
+                                         handle.get_stream());
+  }
+};
+
 template <typename ValT, typename IdxT>
 struct brute_force_knn {
   using dist_t = ValT;
@@ -325,7 +361,13 @@ const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::B
   }
 
 KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 
 KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull);
+KNN_REGISTER(float, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 
 }  // namespace raft::bench::spatial
diff --git a/cpp/include/raft/spatial/knn/ann_common.hpp b/cpp/include/raft/spatial/knn/ann_common.hpp
index 76c2b078c0..e91444a7f2 100644
--- a/cpp/include/raft/spatial/knn/ann_common.hpp
+++ b/cpp/include/raft/spatial/knn/ann_common.hpp
@@ -24,7 +24,7 @@
 namespace raft::spatial::knn {
 
 namespace ivf_flat {
-template <typename T>
+template <typename T, typename IdxT>
 class index;
 };
 
@@ -33,31 +33,31 @@ struct knnIndex {
   float metricArg;
   std::unique_ptr<faiss::gpu::GpuIndex> index;
   std::unique_ptr<MetricProcessor<float>> metric_processor;
-  std::unique_ptr<const ivf_flat::index<float>> ivf_flat_float_;
-  std::unique_ptr<const ivf_flat::index<uint8_t>> ivf_flat_uint8_t_;
-  std::unique_ptr<const ivf_flat::index<int8_t>> ivf_flat_int8_t_;
+  std::unique_ptr<const ivf_flat::index<float, int64_t>> ivf_flat_float_;
+  std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<const ivf_flat::index<int8_t, int64_t>> ivf_flat_int8_t_;
 
   std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
   int device;
 
-  template <typename T>
-  auto ivf_flat() -> std::unique_ptr<const ivf_flat::index<T>>&;
+  template <typename T, typename IdxT>
+  auto ivf_flat() -> std::unique_ptr<const ivf_flat::index<T, IdxT>>&;
 };
 
 template <>
-auto knnIndex::ivf_flat<float>() -> std::unique_ptr<const ivf_flat::index<float>>&
+auto knnIndex::ivf_flat<float>() -> std::unique_ptr<const ivf_flat::index<float, int64_t>>&
 {
   return ivf_flat_float_;
 }
 
 template <>
-auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<const ivf_flat::index<uint8_t>>&
+auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>>&
 {
   return ivf_flat_uint8_t_;
 }
 
 template <>
-auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<const ivf_flat::index<int8_t>>&
+auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<const ivf_flat::index<int8_t, int64_t>>&
 {
   return ivf_flat_int8_t_;
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 862c80122e..125bf315fc 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -132,13 +132,11 @@ void approx_knn_build_index(const handle_t& handle,
   }
   if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(index_array); }
 
-  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2SqrtExpanded ||
-                      metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
-                      metric == raft::distance::DistanceType::L2Unexpanded ||
+  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded ||
                       metric == raft::distance::DistanceType::L2Expanded ||
                       metric == raft::distance::DistanceType::InnerProduct)) {
-    index->ivf_flat<T>() = std::make_unique<const ivf_flat::index<T>>(
-      ivf_flat::build(handle, *ivf_ft_pams, index_array, n, D, stream));
+    index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
+      detail::ivf_flat::build(handle, *ivf_ft_pams, index_array, int64_t(n), D, stream));
   } else {
     RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
     index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
@@ -189,15 +187,15 @@ void approx_knn_search(const handle_t& handle,
       RAFT_FAIL("FAISS-based index supports only float data.");
     }
   } else if (ivf_ft_pams) {
-    ivf_flat::search(handle,
-                     *ivf_ft_pams,
-                     *(index->ivf_flat<T>()),
-                     query_array,
-                     n,
-                     k,
-                     (size_t*)indices,
-                     distances,
-                     handle.get_stream());
+    detail::ivf_flat::search(handle,
+                             *ivf_ft_pams,
+                             *(index->ivf_flat<T, int64_t>()),
+                             query_array,
+                             n,
+                             k,
+                             indices,
+                             distances,
+                             handle.get_stream());
   } else {
     RAFT_FAIL("The model is not trained");
   }
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index a1d3881405..89281c4b23 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -34,19 +34,6 @@ using raft::spatial::knn::ivf_flat::index;
 using raft::spatial::knn::ivf_flat::index_params;
 using raft::spatial::knn::ivf_flat::kIndexGroupSize;
 
-template <typename T, typename... Extents>
-static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents... exts)
-{
-  using extent_t  = extents<((void)exts, dynamic_extent)...>;
-  using mdarray_t = device_mdarray<T, extent_t, layout_c_contiguous>;
-
-  typename mdarray_t::extents_type extent{exts...};
-  typename mdarray_t::mapping_type layout{extent};
-  typename mdarray_t::container_policy_type policy{stream};
-
-  return mdarray_t{layout, policy};
-}
-
 /**
  * @brief Record the dataset into the index, one source row at a time.
  *
@@ -60,6 +47,7 @@ static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents...
  *   there are no dependencies between threads, hence no constraints on the block size.
  *
  * @tparam T the element type.
+ * @tparam IdxT type of the indices in the source dataset
  *
  * @param[in] labels device pointer to the cluster ids for each row [n_rows]
  * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
@@ -73,18 +61,18 @@ static inline auto make_array_for_index(rmm::cuda_stream_view stream, Extents...
  * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`.
  *
  */
-template <typename T>
+template <typename T, typename IdxT>
 __global__ void build_index_kernel(const uint32_t* labels,
-                                   const uint32_t* list_offsets,
+                                   const IdxT* list_offsets,
                                    const T* dataset,
                                    T* list_data,
-                                   uint32_t* list_index,
+                                   IdxT* list_index,
                                    uint32_t* list_sizes_ptr,
-                                   uint32_t n_rows,
+                                   IdxT n_rows,
                                    uint32_t dim,
                                    uint32_t veclen)
 {
-  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
   if (i >= n_rows) { return; }
 
   auto list_id     = labels[i];
@@ -115,13 +103,13 @@ __global__ void build_index_kernel(const uint32_t* labels,
 }
 
 /** See raft::spatial::knn::ivf_flat::build docs */
-template <typename T>
+template <typename T, typename IdxT>
 inline auto build(const handle_t& handle,
                   const index_params& params,
                   const T* dataset,
-                  uint32_t n_rows,
+                  IdxT n_rows,
                   uint32_t dim,
-                  rmm::cuda_stream_view stream) -> index<T>
+                  rmm::cuda_stream_view stream) -> index<T, IdxT>
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("ivf_flat::build(%u, %u)", n_rows, dim);
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
@@ -138,8 +126,8 @@ inline auto build(const handle_t& handle,
 
   // kmeans cluster ids for the dataset
   rmm::device_uvector<uint32_t> labels(n_rows, stream);
-  auto&& centers      = make_array_for_index<float>(stream, n_lists, dim);
-  auto&& list_sizes   = make_array_for_index<uint32_t>(stream, n_lists);
+  auto&& centers      = make_device_mdarray<float>(stream, n_lists, dim);
+  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
   auto list_sizes_ptr = list_sizes.data();
 
   // Predict labels of the whole dataset
@@ -157,7 +145,7 @@ inline auto build(const handle_t& handle,
                                  stream);
 
   // Calculate offsets into cluster data using exclusive scan
-  auto&& list_offsets   = make_array_for_index<uint32_t>(stream, n_lists + 1);
+  auto&& list_offsets   = make_device_mdarray<IdxT>(stream, n_lists + 1);
   auto list_offsets_ptr = list_offsets.data();
 
   thrust::exclusive_scan(
@@ -165,21 +153,21 @@ inline auto build(const handle_t& handle,
     list_sizes_ptr,
     list_sizes_ptr + n_lists + 1,
     list_offsets_ptr,
-    uint32_t(0),
-    [] __device__(uint32_t s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
+    IdxT(0),
+    [] __device__(IdxT s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
 
-  uint32_t index_size;
+  IdxT index_size;
   update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
   handle.sync_stream(stream);
 
-  auto&& data    = make_array_for_index<T>(stream, index_size, dim);
-  auto&& indices = make_array_for_index<uint32_t>(stream, index_size);
+  auto&& data    = make_device_mdarray<T>(stream, index_size, dim);
+  auto&& indices = make_device_mdarray<IdxT>(stream, index_size);
 
   // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
   utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream);
 
   const dim3 block_dim(256);
-  const dim3 grid_dim(raft::ceildiv<uint32_t>(n_rows, block_dim.x));
+  const dim3 grid_dim(raft::ceildiv<IdxT>(n_rows, block_dim.x));
   build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(labels.data(),
                                                          list_offsets_ptr,
                                                          dataset,
@@ -193,7 +181,7 @@ inline auto build(const handle_t& handle,
 
   // Precompute the centers vector norms for L2Expanded distance
   auto compute_norms = [&]() {
-    auto&& r = make_array_for_index<float>(stream, n_lists);
+    auto&& r = make_device_mdarray<float>(stream, n_lists);
     utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream);
     RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
@@ -203,14 +191,14 @@ inline auto build(const handle_t& handle,
                           : std::nullopt;
 
   // assemble the index
-  index<T> index{veclen,
-                 params.metric,
-                 std::move(data),
-                 std::move(indices),
-                 std::move(list_sizes),
-                 std::move(list_offsets),
-                 std::move(centers),
-                 std::move(center_norms)};
+  index<T, IdxT> index{veclen,
+                       params.metric,
+                       std::move(data),
+                       std::move(indices),
+                       std::move(list_sizes),
+                       std::move(list_offsets),
+                       std::move(centers),
+                       std::move(center_norms)};
 
   // check index invariants
   index.check_consistency();
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index fbd1af4343..fd0e3eeea5 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -649,30 +649,36 @@ struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
  * block; this number must be a multiple of `WarpSize * Veclen`.
  * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
  * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes]
- * @param[in] list_indices index<T>.indices
- * @param[in] list_data index<T>.data
- * @param[in] list_sizes index<T>.list_sizes
- * @param[in] list_offsets index<T>.list_offsets
+ * @param[in] list_indices index<T, IdxT>.indices
+ * @param[in] list_data index<T, IdxT>.data
+ * @param[in] list_sizes index<T, IdxT>.list_sizes
+ * @param[in] list_offsets index<T, IdxT>.list_offsets
  * @param n_probes
  * @param k
  * @param dim
  * @param[out] neighbors
  * @param[out] distances
  */
-template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename Lambda>
 __global__ void __launch_bounds__(kThreadsPerBlock)
   interleaved_scan_kernel(Lambda compute_dist,
                           const uint32_t query_smem_elems,
                           const T* query,
                           const uint32_t* coarse_index,
-                          const uint32_t* list_indices,
+                          const IdxT* list_indices,
                           const T* list_data,
                           const uint32_t* list_sizes,
-                          const uint32_t* list_offsets,
+                          const IdxT* list_offsets,
                           const uint32_t n_probes,
                           const uint32_t k,
                           const uint32_t dim,
-                          size_t* neighbors,
+                          IdxT* neighbors,
                           float* distances)
 {
   extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
@@ -694,7 +700,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
   __syncthreads();
 
-  topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, size_t> queue(
+  topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, IdxT> queue(
     k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
 
   {
@@ -796,22 +802,29 @@ uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMem
   return min_grid_x > n_probes ? n_probes : static_cast<uint32_t>(min_grid_x);
 }
 
-template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename Lambda>
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename Lambda>
 void launch_kernel(Lambda lambda,
-                   const ivf_flat::index<T>& index,
+                   const ivf_flat::index<T, IdxT>& index,
                    const T* queries,
                    const uint32_t* coarse_index,
                    const uint32_t num_queries,
                    const uint32_t n_probes,
                    const uint32_t k,
-                   size_t* neighbors,
+                   IdxT* neighbors,
                    float* distances,
                    uint32_t& grid_dim_x,
                    rmm::cuda_stream_view stream)
 {
   RAFT_EXPECTS(Veclen == index.veclen,
                "Configured Veclen does not match the index interleaving pattern.");
-  constexpr auto kKernel   = interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, Lambda>;
+  constexpr auto kKernel =
+    interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda>;
   const int max_query_smem = 16384;
   int query_smem_elems =
     std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
@@ -909,17 +922,33 @@ struct inner_prod_dist {
 };
 
 /** Select the distance computation function and forward the rest of the arguments. */
-template <int Capacity, int Veclen, bool Ascending, typename T, typename AccT, typename... Args>
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename... Args>
 void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
 {
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
     case raft::distance::DistanceType::L2Unexpanded:
-      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, euclidean_dist<Veclen, T, AccT>>(
-        {}, std::forward<Args>(args)...);
+      return launch_kernel<Capacity,
+                           Veclen,
+                           Ascending,
+                           T,
+                           AccT,
+                           IdxT,
+                           euclidean_dist<Veclen, T, AccT>>({}, std::forward<Args>(args)...);
     case raft::distance::DistanceType::InnerProduct:
-      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, inner_prod_dist<Veclen, T, AccT>>(
-        {}, std::forward<Args>(args)...);
+      return launch_kernel<Capacity,
+                           Veclen,
+                           Ascending,
+                           T,
+                           AccT,
+                           IdxT,
+                           inner_prod_dist<Veclen, T, AccT>>({}, std::forward<Args>(args)...);
     // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
     default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
   }
@@ -931,6 +960,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg
  */
 template <typename T,
           typename AccT,
+          typename IdxT,
           int Capacity = topk::kMaxCapacity,
           int Veclen   = std::max<int>(1, 16 / sizeof(T))>
 struct select_interleaved_scan_kernel {
@@ -945,13 +975,13 @@ struct select_interleaved_scan_kernel {
   {
     if constexpr (Capacity > 1) {
       if (capacity * 2 <= Capacity) {
-        return select_interleaved_scan_kernel<T, AccT, Capacity / 2, Veclen>::run(
+        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity / 2, Veclen>::run(
           capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
     if constexpr (Veclen > 1) {
       if (veclen * 2 <= Veclen) {
-        return select_interleaved_scan_kernel<T, AccT, Capacity, Veclen / 2>::run(
+        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity, Veclen / 2>::run(
           capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
@@ -965,9 +995,9 @@ struct select_interleaved_scan_kernel {
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
     if (select_min) {
-      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT>(std::forward<Args>(args)...);
+      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT, IdxT>(std::forward<Args>(args)...);
     } else {
-      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT>(std::forward<Args>(args)...);
+      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT, IdxT>(std::forward<Args>(args)...);
     }
   }
 };
@@ -977,6 +1007,7 @@ struct select_interleaved_scan_kernel {
  *
  * @tparam T value type
  * @tparam AccT accumulated type
+ * @tparam IdxT type of the indices
  *
  * @param index previously built ivf-flat index
  * @param[in] queries device pointer to the query vectors [batch_size, dim]
@@ -996,8 +1027,8 @@ struct select_interleaved_scan_kernel {
  *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
  * @param stream
  */
-template <typename T, typename AccT>
-void ivfflat_interleaved_scan(const ivf_flat::index<T>& index,
+template <typename T, typename AccT, typename IdxT>
+void ivfflat_interleaved_scan(const ivf_flat::index<T, IdxT>& index,
                               const T* queries,
                               const uint32_t* coarse_query_results,
                               const uint32_t n_queries,
@@ -1005,37 +1036,37 @@ void ivfflat_interleaved_scan(const ivf_flat::index<T>& index,
                               const uint32_t n_probes,
                               const uint32_t k,
                               const bool select_min,
-                              size_t* neighbors,
+                              IdxT* neighbors,
                               float* distances,
                               uint32_t& grid_dim_x,
                               rmm::cuda_stream_view stream)
 {
   const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
-  select_interleaved_scan_kernel<T, AccT>::run(capacity,
-                                               index.veclen,
-                                               select_min,
-                                               metric,
-                                               index,
-                                               queries,
-                                               coarse_query_results,
-                                               n_queries,
-                                               n_probes,
-                                               k,
-                                               neighbors,
-                                               distances,
-                                               grid_dim_x,
-                                               stream);
+  select_interleaved_scan_kernel<T, AccT, IdxT>::run(capacity,
+                                                     index.veclen,
+                                                     select_min,
+                                                     metric,
+                                                     index,
+                                                     queries,
+                                                     coarse_query_results,
+                                                     n_queries,
+                                                     n_probes,
+                                                     k,
+                                                     neighbors,
+                                                     distances,
+                                                     grid_dim_x,
+                                                     stream);
 }
 
-template <typename T, typename AccT>
+template <typename T, typename AccT, typename IdxT>
 void search_impl(const handle_t& handle,
-                 const index<T>& index,
+                 const index<T, IdxT>& index,
                  const T* queries,
                  uint32_t n_queries,
                  uint32_t k,
                  uint32_t n_probes,
                  bool select_min,
-                 size_t* neighbors,
+                 IdxT* neighbors,
                  AccT* distances,
                  rmm::cuda_stream_view stream,
                  rmm::mr::device_memory_resource* search_mr)
@@ -1051,7 +1082,7 @@ void search_impl(const handle_t& handle,
   // The topk distance value of candicate vectors from each cluster(list)
   rmm::device_uvector<AccT> refined_distances_dev(n_queries * n_probes * k, stream, search_mr);
   // The topk index of candicate vectors from each cluster(list)
-  rmm::device_uvector<size_t> refined_indices_dev(n_queries * n_probes * k, stream, search_mr);
+  rmm::device_uvector<IdxT> refined_indices_dev(n_queries * n_probes * k, stream, search_mr);
 
   size_t float_query_size;
   if constexpr (std::is_integral_v<T>) {
@@ -1133,24 +1164,24 @@ void search_impl(const handle_t& handle,
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), 1 * n_probes);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), 1 * n_probes);
 
-  AccT* distances_dev_ptr = refined_distances_dev.data();
-  size_t* indices_dev_ptr = refined_indices_dev.data();
+  auto distances_dev_ptr = refined_distances_dev.data();
+  auto indices_dev_ptr   = refined_indices_dev.data();
 
   uint32_t grid_dim_x = 0;
   if (n_probes > 1) {
     // query the gridDimX size to store probes topK output
-    ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index,
-                                                                    nullptr,
-                                                                    nullptr,
-                                                                    n_queries,
-                                                                    index.metric,
-                                                                    n_probes,
-                                                                    k,
-                                                                    select_min,
-                                                                    nullptr,
-                                                                    nullptr,
-                                                                    grid_dim_x,
-                                                                    stream);
+    ivfflat_interleaved_scan<T, typename utils::config<T>::value_t, IdxT>(index,
+                                                                          nullptr,
+                                                                          nullptr,
+                                                                          n_queries,
+                                                                          index.metric,
+                                                                          n_probes,
+                                                                          k,
+                                                                          select_min,
+                                                                          nullptr,
+                                                                          nullptr,
+                                                                          grid_dim_x,
+                                                                          stream);
   } else {
     grid_dim_x = 1;
   }
@@ -1160,18 +1191,18 @@ void search_impl(const handle_t& handle,
     indices_dev_ptr   = neighbors;
   }
 
-  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t>(index,
-                                                                  queries,
-                                                                  coarse_indices_dev.data(),
-                                                                  n_queries,
-                                                                  index.metric,
-                                                                  n_probes,
-                                                                  k,
-                                                                  select_min,
-                                                                  indices_dev_ptr,
-                                                                  distances_dev_ptr,
-                                                                  grid_dim_x,
-                                                                  stream);
+  ivfflat_interleaved_scan<T, typename utils::config<T>::value_t, IdxT>(index,
+                                                                        queries,
+                                                                        coarse_indices_dev.data(),
+                                                                        n_queries,
+                                                                        index.metric,
+                                                                        n_probes,
+                                                                        k,
+                                                                        select_min,
+                                                                        indices_dev_ptr,
+                                                                        distances_dev_ptr,
+                                                                        grid_dim_x,
+                                                                        stream);
 
   RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k);
   RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k);
@@ -1179,42 +1210,42 @@ void search_impl(const handle_t& handle,
   // Merge topk values from different blocks
   if (grid_dim_x > 1) {
     if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) {
-      topk::warp_sort_topk<AccT, size_t>(refined_distances_dev.data(),
-                                         refined_indices_dev.data(),
-                                         n_queries,
-                                         k * grid_dim_x,
-                                         k,
-                                         distances,
-                                         neighbors,
-                                         select_min,
-                                         stream,
-                                         search_mr);
+      topk::warp_sort_topk<AccT, IdxT>(refined_distances_dev.data(),
+                                       refined_indices_dev.data(),
+                                       n_queries,
+                                       k * grid_dim_x,
+                                       k,
+                                       distances,
+                                       neighbors,
+                                       select_min,
+                                       stream,
+                                       search_mr);
     } else {
       // NB: this branch can only be triggered once `ivfflat_interleaved_scan` above supports larger
       // `k` values (kMaxCapacity limit as a dependency of topk::block_sort)
-      topk::radix_topk<AccT, size_t, 11, 512>(refined_distances_dev.data(),
-                                              refined_indices_dev.data(),
-                                              n_queries,
-                                              k * grid_dim_x,
-                                              k,
-                                              distances,
-                                              neighbors,
-                                              select_min,
-                                              stream,
-                                              search_mr);
+      topk::radix_topk<AccT, IdxT, 11, 512>(refined_distances_dev.data(),
+                                            refined_indices_dev.data(),
+                                            n_queries,
+                                            k * grid_dim_x,
+                                            k,
+                                            distances,
+                                            neighbors,
+                                            select_min,
+                                            stream,
+                                            search_mr);
     }
   }
 }
 
 /** See raft::spatial::knn::ivf_flat::search docs */
-template <typename T>
+template <typename T, typename IdxT>
 inline void search(const handle_t& handle,
                    const search_params& params,
-                   const index<T>& index,
+                   const index<T, IdxT>& index,
                    const T* queries,
                    uint32_t n_queries,
                    uint32_t k,
-                   size_t* neighbors,
+                   IdxT* neighbors,
                    float* distances,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr = nullptr)
@@ -1245,7 +1276,7 @@ inline void search(const handle_t& handle,
                    pool_guard->pool_size());
   }
 
-  return search_impl<T, float>(
+  return search_impl<T, float, IdxT>(
     handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, stream, mr);
 }
 
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index e05ac40fde..fcf96caad7 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -36,6 +36,7 @@ namespace raft::spatial::knn::ivf_flat {
  *   InnerProduct
  *
  * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  *
  * @param handle
  * @param params configure the index building
@@ -46,13 +47,13 @@ namespace raft::spatial::knn::ivf_flat {
  *
  * @return the constructed ivf-flat index
  */
-template <typename T>
+template <typename T, typename IdxT = uint32_t>
 inline auto build(const handle_t& handle,
                   const index_params& params,
                   const T* dataset,
-                  uint32_t n_rows,
+                  IdxT n_rows,
                   uint32_t dim,
-                  rmm::cuda_stream_view stream) -> const index<T>
+                  rmm::cuda_stream_view stream) -> index<T, IdxT>
 {
   return raft::spatial::knn::detail::ivf_flat::build(handle, params, dataset, n_rows, dim, stream);
 }
@@ -61,6 +62,7 @@ inline auto build(const handle_t& handle,
  * @brief Search ANN using the constructed index.
  *
  * @tparam T data element type
+ * @tparam IdxT type of the indices
  *
  * @param handle
  * @param params configure the search
@@ -75,14 +77,14 @@ inline auto build(const handle_t& handle,
  * @param mr an optional memory resource to use across the searches (you can provide a large enough
  *           memory pool here to avoid memory allocations within search).
  */
-template <typename T>
+template <typename T, typename IdxT>
 inline void search(const handle_t& handle,
                    const search_params& params,
-                   const index<T>& index,
+                   const index<T, IdxT>& index,
                    const T* queries,
                    uint32_t n_queries,
                    uint32_t k,
-                   size_t* neighbors,
+                   IdxT* neighbors,
                    float* distances,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr = nullptr)
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 90a0610f7f..610e7bf516 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -20,6 +20,7 @@
 
 #include <raft/core/mdarray.hpp>
 #include <raft/distance/distance_type.hpp>
+#include <raft/integer_utils.h>
 
 #include <optional>
 
@@ -42,14 +43,13 @@ constexpr static uint32_t kIndexGroupSize = 32;
  * is to make all public factory functions, such as `ivf_flat::build` return `const index`.
  *
  * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  *
  */
-template <typename T>
+template <typename T, typename IdxT>
 struct index {
-  using row_major = layout_c_contiguous;
-  using extent_1d = extents<dynamic_extent>;
-  using extent_2d = extents<dynamic_extent, dynamic_extent>;
-
+  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
+                "IdxT must be able to represent all values of uint32_t");
   /**
    * Vectorized load/store size in elements, determines the size of interleaved data chunks.
    *
@@ -85,14 +85,14 @@ struct index {
    */
   device_mdarray<T, extent_2d, row_major> data;
   /** Inverted list indices: ids of items in the source data [size] */
-  device_mdarray<uint32_t, extent_1d, row_major> indices;
+  device_mdarray<IdxT, extent_1d, row_major> indices;
   /** Sizes of the lists (clusters) [n_lists] */
   device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
   /**
    * Offsets into the lists [n_lists + 1].
    * The last value contains the total length of the index.
    */
-  device_mdarray<uint32_t, extent_1d, row_major> list_offsets;
+  device_mdarray<IdxT, extent_1d, row_major> list_offsets;
   /** k-means cluster centers corresponding to the lists [n_lists, dim] */
   device_mdarray<float, extent_2d, row_major> centers;
   /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists]  */
@@ -106,13 +106,19 @@ struct index {
   ~index()                          = default;
 
   /** Total length of the index. */
-  [[nodiscard]] constexpr inline auto size() const noexcept -> size_t { return data.extent(0); }
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
+  {
+    return static_cast<uint32_t>(data.extent(0));
+  }
   /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> size_t { return data.extent(1); }
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
+  {
+    return static_cast<uint32_t>(data.extent(1));
+  }
   /** Number of clusters/inverted lists. */
-  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> size_t
+  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t
   {
-    return centers.extent(0);
+    return static_cast<uint32_t>(centers.extent(0));
   }
 
   /** Throw an error if the index content is inconsistent. */
@@ -141,7 +147,7 @@ struct index_params : ivf_index_params {
 struct search_params : ivf_search_params {
 };
 
-static_assert(std::is_standard_layout_v<index<float>>);
-static_assert(std::is_aggregate_v<index<float>>);
+static_assert(std::is_standard_layout_v<index<float, uint32_t>>);
+static_assert(std::is_aggregate_v<index<float, uint32_t>>);
 
 }  // namespace raft::spatial::knn::ivf_flat

From 1283cbec346f935e02c92f59efd4facf5ed1e0c9 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 1 Jul 2022 17:21:38 +0200
Subject: [PATCH 104/118] Revert the api changes as much as possible and
 deprecate the old api

---
 cpp/include/raft/spatial/knn/ann.cuh          |  47 +++----
 cpp/include/raft/spatial/knn/ann_common.h     | 118 ++++++++++++++++++
 cpp/include/raft/spatial/knn/common.hpp       |  38 ++++++
 .../knn/detail/ann_kmeans_balanced.cuh        |   1 -
 .../raft/spatial/knn/detail/ann_quantized.cuh |  83 ++++++------
 .../raft/spatial/knn/detail/ann_utils.cuh     |   2 -
 .../spatial/knn/detail/ivf_flat_build.cuh     |  11 +-
 .../spatial/knn/detail/ivf_flat_search.cuh    |   8 +-
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |   4 +-
 .../raft/spatial/knn/ivf_flat_types.hpp       |  14 ++-
 cpp/test/spatial/ann_ivf_flat.cu              | 106 +++++++++++-----
 11 files changed, 314 insertions(+), 118 deletions(-)
 create mode 100644 cpp/include/raft/spatial/knn/ann_common.h
 create mode 100644 cpp/include/raft/spatial/knn/common.hpp

diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 8d580db6de..befb5524ac 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -16,11 +16,10 @@
 
 #pragma once
 
-#include "ann_common.hpp"
+#include "ann_common.h"
 #include "detail/ann_quantized.cuh"
 
 #include <raft/core/nvtx.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
 
 namespace raft::spatial::knn {
 
@@ -31,21 +30,26 @@ namespace raft::spatial::knn {
  * @param[in] handle RAFT handle
  * @param[out] index index to be built
  * @param[in] params parametrization of the index to be built
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metricArg metric argument
  * @param[in] index_array the index array to build the index with
  * @param[in] n number of rows in the index array
  * @param[in] D the dimensionality of the index array
  */
 template <typename T = float, typename value_idx = int>
-inline void approx_knn_build_index(const raft::handle_t& handle,
-                                   knnIndex* index,
-                                   const knn_index_params& params,
-                                   T* index_array,
-                                   value_idx n,
-                                   value_idx D)
+[[deprecated("Consider using new-style raft::spatial::knn::*::build functions")]] inline void
+approx_knn_build_index(raft::handle_t& handle,
+                       raft::spatial::knn::knnIndex* index,
+                       knnIndexParam* params,
+                       raft::distance::DistanceType metric,
+                       float metricArg,
+                       T* index_array,
+                       value_idx n,
+                       value_idx D)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "approx_knn_build_index(n_rows = %u, dim = %u)", n, D);
-  detail::approx_knn_build_index(handle, index, params, index_array, n, D);
+    "legacy approx_knn_build_index(n_rows = %u, dim = %u)", n, D);
+  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
 }
 
 /**
@@ -57,24 +61,23 @@ inline void approx_knn_build_index(const raft::handle_t& handle,
  *                       their query point
  * @param[out] indices indices of the nearest neighbors
  * @param[in] index index to perform a search with
- * @param[in] params configure search
  * @param[in] k the number of nearest neighbors to search for
  * @param[in] query_array the query to perform a search with
- * @param[in] n_queries number of rows in the query array
+ * @param[in] n number of rows in the query array
  */
 template <typename T = float, typename value_idx = int>
-inline void approx_knn_search(const raft::handle_t& handle,
-                              float* distances,
-                              int64_t* indices,
-                              knnIndex* index,
-                              const knn_search_params& params,
-                              value_idx k,
-                              T* query_array,
-                              value_idx n_queries)
+[[deprecated("Consider using new-style raft::spatial::knn::*::search functions")]] inline void
+approx_knn_search(raft::handle_t& handle,
+                  float* distances,
+                  int64_t* indices,
+                  raft::spatial::knn::knnIndex* index,
+                  value_idx k,
+                  T* query_array,
+                  value_idx n)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "approx_knn_search(k = %u, n_queries = %u)", k, n_queries);
-  detail::approx_knn_search(handle, distances, indices, index, params, k, query_array, n_queries);
+    "legacy approx_knn_search(k = %u, n_queries = %u)", k, n);
+  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
 }
 
 }  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
new file mode 100644
index 0000000000..463950045f
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the other approximate KNN implementations defined in spatial/knn/*.")
+
+#pragma once
+
+#include "detail/processing.hpp"
+#include "ivf_flat_types.hpp"
+
+#include <raft/distance/distance_type.hpp>
+
+#include <faiss/gpu/GpuIndex.h>
+#include <raft/spatial/knn/faiss_mr.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+struct knnIndex {
+  raft::distance::DistanceType metric;
+  float metricArg;
+  int nprobe;
+  std::unique_ptr<faiss::gpu::GpuIndex> index;
+  std::unique_ptr<MetricProcessor<float>> metric_processor;
+  std::unique_ptr<const ivf_flat::index<float, int64_t>> ivf_flat_float_;
+  std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>> ivf_flat_uint8_t_;
+  std::unique_ptr<const ivf_flat::index<int8_t, int64_t>> ivf_flat_int8_t_;
+
+  std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
+  int device;
+
+  template <typename T, typename IdxT>
+  auto ivf_flat() -> std::unique_ptr<const ivf_flat::index<T, IdxT>>&;
+};
+
+template <>
+auto knnIndex::ivf_flat<float, int64_t>() -> std::unique_ptr<const ivf_flat::index<float, int64_t>>&
+{
+  return ivf_flat_float_;
+}
+
+template <>
+auto knnIndex::ivf_flat<uint8_t, int64_t>()
+  -> std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>>&
+{
+  return ivf_flat_uint8_t_;
+}
+
+template <>
+auto knnIndex::ivf_flat<int8_t, int64_t>()
+  -> std::unique_ptr<const ivf_flat::index<int8_t, int64_t>>&
+{
+  return ivf_flat_int8_t_;
+}
+
+enum QuantizerType : unsigned int {
+  QT_8bit,
+  QT_4bit,
+  QT_8bit_uniform,
+  QT_4bit_uniform,
+  QT_fp16,
+  QT_8bit_direct,
+  QT_6bit
+};
+
+struct knnIndexParam {
+  virtual ~knnIndexParam() {}
+};
+
+struct IVFParam : knnIndexParam {
+  int nlist;
+  int nprobe;
+};
+
+struct IVFFlatParam : IVFParam {
+};
+
+struct IVFPQParam : IVFParam {
+  int M;
+  int n_bits;
+  bool usePrecomputedTables;
+};
+
+struct IVFSQParam : IVFParam {
+  QuantizerType qtype;
+  bool encodeResidual;
+};
+
+inline auto from_legacy_index_params(const IVFFlatParam& legacy,
+                                     raft::distance::DistanceType metric,
+                                     float metric_arg)
+{
+  ivf_flat::index_params params;
+  params.metric     = metric;
+  params.metric_arg = metric_arg;
+  params.n_lists    = legacy.nlist;
+  return params;
+}
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/common.hpp b/cpp/include/raft/spatial/knn/common.hpp
new file mode 100644
index 0000000000..4704093065
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/common.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance_type.hpp>
+
+namespace raft::spatial::knn {
+
+/** The base for approximate KNN index structures. */
+struct index {
+};
+
+/** The base for KNN index parameters. */
+struct index_params {
+  /** Distance type. */
+  raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
+  /** The argument used by some distance metrics. */
+  float metric_arg = 2.0f;
+};
+
+struct search_params {
+};
+
+};  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index c327ec6ed0..30a8262e4a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "../ann_common.hpp"
 #include "ann_utils.cuh"
 
 #include <raft/common/nvtx.hpp>
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 125bf315fc..7b07d02d36 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../ann_common.hpp"
+#include "../ann_common.h"
 #include "../ivf_flat.cuh"
 #include "knn_brute_force_faiss.cuh"
 
@@ -65,67 +65,60 @@ inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qty
 
 template <typename IntType = int>
 void approx_knn_ivfflat_build_index(knnIndex* index,
-                                    const ivf_index_params& params,
+                                    const IVFFlatParam& params,
                                     IntType n,
                                     IntType D)
 {
   faiss::gpu::GpuIndexIVFFlatConfig config;
   config.device                  = index->device;
-  faiss::MetricType faiss_metric = build_faiss_metric(params.metric);
+  faiss::MetricType faiss_metric = build_faiss_metric(index->metric);
   index->index.reset(
-    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.n_lists, faiss_metric, config));
+    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.nlist, faiss_metric, config));
 }
 
 template <typename IntType = int>
-void approx_knn_ivfpq_build_index(knnIndex* index,
-                                  const ivf_pq_index_params& params,
-                                  IntType n,
-                                  IntType D)
+void approx_knn_ivfpq_build_index(knnIndex* index, const IVFPQParam& params, IntType n, IntType D)
 {
   faiss::gpu::GpuIndexIVFPQConfig config;
   config.device                  = index->device;
-  config.usePrecomputedTables    = params.use_precomputed_tables;
+  config.usePrecomputedTables    = params.usePrecomputedTables;
   config.interleavedLayout       = params.n_bits != 8;
-  faiss::MetricType faiss_metric = build_faiss_metric(params.metric);
-  index->index.reset(new faiss::gpu::GpuIndexIVFPQ(index->gpu_res.get(),
-                                                   D,
-                                                   params.n_lists,
-                                                   params.n_subquantizers,
-                                                   params.n_bits,
-                                                   faiss_metric,
-                                                   config));
+  faiss::MetricType faiss_metric = build_faiss_metric(index->metric);
+  index->index.reset(new faiss::gpu::GpuIndexIVFPQ(
+    index->gpu_res.get(), D, params.nlist, params.M, params.n_bits, faiss_metric, config));
 }
 
 template <typename IntType = int>
-void approx_knn_ivfsq_build_index(knnIndex* index,
-                                  const ivf_sq_index_params& params,
-                                  IntType n,
-                                  IntType D)
+void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, IntType n, IntType D)
 {
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
   config.device                                     = index->device;
-  faiss::MetricType faiss_metric                    = build_faiss_metric(params.metric);
+  faiss::MetricType faiss_metric                    = build_faiss_metric(index->metric);
   faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype);
   index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer(
-    index->gpu_res.get(), D, params.n_lists, faiss_qtype, faiss_metric, params.encode_residual));
+    index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, params.encodeResidual));
 }
 
 template <typename T = float, typename IntType = int>
 void approx_knn_build_index(const handle_t& handle,
                             knnIndex* index,
-                            const knn_index_params& params,
+                            knnIndexParam* params,
+                            raft::distance::DistanceType metric,
+                            float metricArg,
                             T* index_array,
                             IntType n,
                             IntType D)
 {
   auto stream      = handle.get_stream();
-  auto metric      = params.metric;
   index->index     = nullptr;
   index->metric    = metric;
-  index->metricArg = params.metric_arg;
-  auto ivf_ft_pams = dynamic_cast<const ivf_flat::index_params*>(&params);
-  auto ivf_pq_pams = dynamic_cast<const ivf_pq_index_params*>(&params);
-  auto ivf_sq_pams = dynamic_cast<const ivf_sq_index_params*>(&params);
+  index->metricArg = metricArg;
+  if (dynamic_cast<const IVFParam*>(params)) {
+    index->nprobe = dynamic_cast<const IVFParam*>(params)->nprobe;
+  }
+  auto ivf_ft_pams = dynamic_cast<IVFFlatParam*>(params);
+  auto ivf_pq_pams = dynamic_cast<IVFPQParam*>(params);
+  auto ivf_sq_pams = dynamic_cast<IVFSQParam*>(params);
 
   if constexpr (std::is_same_v<T, float>) {
     index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
@@ -135,8 +128,9 @@ void approx_knn_build_index(const handle_t& handle,
   if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded ||
                       metric == raft::distance::DistanceType::L2Expanded ||
                       metric == raft::distance::DistanceType::InnerProduct)) {
+    auto new_params               = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
     index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
-      detail::ivf_flat::build(handle, *ivf_ft_pams, index_array, int64_t(n), D, stream));
+      ivf_flat::build(handle, new_params, index_array, int64_t(n), D, stream));
   } else {
     RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
     index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
@@ -167,15 +161,12 @@ void approx_knn_search(const handle_t& handle,
                        float* distances,
                        int64_t* indices,
                        knnIndex* index,
-                       const knn_search_params& params,
                        IntType k,
                        T* query_array,
                        IntType n)
 {
-  auto ivf_ft_pams = dynamic_cast<const ivf_flat::search_params*>(&params);
-  auto ivf_pams    = dynamic_cast<const ivf_search_params*>(&params);
-  auto faiss_ivf   = dynamic_cast<GpuIndexIVF*>(index->index.get());
-  if (ivf_pams && faiss_ivf) { faiss_ivf->setNumProbes(ivf_pams->n_probes); }
+  auto faiss_ivf = dynamic_cast<GpuIndexIVF*>(index->index.get());
+  if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); }
 
   if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(query_array); }
 
@@ -186,16 +177,18 @@ void approx_knn_search(const handle_t& handle,
     } else {
       RAFT_FAIL("FAISS-based index supports only float data.");
     }
-  } else if (ivf_ft_pams) {
-    detail::ivf_flat::search(handle,
-                             *ivf_ft_pams,
-                             *(index->ivf_flat<T, int64_t>()),
-                             query_array,
-                             n,
-                             k,
-                             indices,
-                             distances,
-                             handle.get_stream());
+  } else if (index->ivf_flat<T, int64_t>()) {
+    ivf_flat::search_params params;
+    params.n_probes = index->nprobe;
+    ivf_flat::search(handle,
+                     params,
+                     *(index->ivf_flat<T, int64_t>()),
+                     query_array,
+                     n,
+                     k,
+                     indices,
+                     distances,
+                     handle.get_stream());
   } else {
     RAFT_FAIL("The model is not trained");
   }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index c3c723d71a..f1d198a99a 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include "../ann_common.hpp"
-
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 89281c4b23..6b7f7662be 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -28,11 +28,9 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace raft::spatial::knn::detail::ivf_flat {
+namespace raft::spatial::knn::ivf_flat::detail {
 
-using raft::spatial::knn::ivf_flat::index;
-using raft::spatial::knn::ivf_flat::index_params;
-using raft::spatial::knn::ivf_flat::kIndexGroupSize;
+using namespace raft::spatial::knn::detail;  // NOLINT
 
 /**
  * @brief Record the dataset into the index, one source row at a time.
@@ -191,7 +189,8 @@ inline auto build(const handle_t& handle,
                           : std::nullopt;
 
   // assemble the index
-  index<T, IdxT> index{veclen,
+  index<T, IdxT> index{{},
+                       veclen,
                        params.metric,
                        std::move(data),
                        std::move(indices),
@@ -206,4 +205,4 @@ inline auto build(const handle_t& handle,
   return index;
 }
 
-}  // namespace raft::spatial::knn::detail::ivf_flat
+}  // namespace raft::spatial::knn::ivf_flat::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index fd0e3eeea5..c04ece3858 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -37,11 +37,9 @@
 
 #include <optional>
 
-namespace raft::spatial::knn::detail::ivf_flat {
+namespace raft::spatial::knn::ivf_flat::detail {
 
-using raft::spatial::knn::ivf_flat::index;
-using raft::spatial::knn::ivf_flat::kIndexGroupSize;
-using raft::spatial::knn::ivf_flat::search_params;
+using namespace raft::spatial::knn::detail;  // NOLINT
 
 constexpr int kThreadsPerBlock = 128;
 
@@ -1280,4 +1278,4 @@ inline void search(const handle_t& handle,
     handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, stream, mr);
 }
 
-}  // namespace raft::spatial::knn::detail::ivf_flat
+}  // namespace raft::spatial::knn::ivf_flat::detail
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index fcf96caad7..1564f74dcd 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -55,7 +55,7 @@ inline auto build(const handle_t& handle,
                   uint32_t dim,
                   rmm::cuda_stream_view stream) -> index<T, IdxT>
 {
-  return raft::spatial::knn::detail::ivf_flat::build(handle, params, dataset, n_rows, dim, stream);
+  return raft::spatial::knn::ivf_flat::detail::build(handle, params, dataset, n_rows, dim, stream);
 }
 
 /**
@@ -89,7 +89,7 @@ inline void search(const handle_t& handle,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr = nullptr)
 {
-  return raft::spatial::knn::detail::ivf_flat::search(
+  return raft::spatial::knn::ivf_flat::detail::search(
     handle, params, index, queries, n_queries, k, neighbors, distances, stream, mr);
 }
 
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 610e7bf516..f02514d00e 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "ann_common.hpp"
+#include "common.hpp"
 
 #include <raft/core/mdarray.hpp>
 #include <raft/distance/distance_type.hpp>
@@ -47,7 +47,7 @@ constexpr static uint32_t kIndexGroupSize = 32;
  *
  */
 template <typename T, typename IdxT>
-struct index {
+struct index : knn::index {
   static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
                 "IdxT must be able to represent all values of uint32_t");
   /**
@@ -137,17 +137,23 @@ struct index {
   }
 };
 
-struct index_params : ivf_index_params {
+struct index_params : knn::index_params {
+  /** The number of inverted lists (clusters) */
+  uint32_t n_lists = 1024;
   /** The number of iterations searching for kmeans centers (index building). */
   uint32_t kmeans_n_iters = 20;
   /** The fraction of data to use during iterative kmeans building. */
   double kmeans_trainset_fraction = 0.5;
 };
 
-struct search_params : ivf_search_params {
+struct search_params : knn::search_params {
+  /** The number of clusters to search. */
+  uint32_t n_probes = 20;
 };
 
 static_assert(std::is_standard_layout_v<index<float, uint32_t>>);
 static_assert(std::is_aggregate_v<index<float, uint32_t>>);
+static_assert(std::is_aggregate_v<index_params>);
+static_assert(std::is_aggregate_v<search_params>);
 
 }  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 2f2fa0f34f..f41127942d 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -21,6 +21,7 @@
 #include <raft/distance/distance_type.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
+#include <raft/spatial/knn/ivf_flat.cuh>
 #include <raft/spatial/knn/knn.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -143,43 +144,86 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
     }
 
     {
+      // unless something is really wrong with clustering, this could serve as a lower bound on
+      // recall
+      double min_recall = static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist);
+
       rmm::device_uvector<T> distances_ivfflat_dev(queries_size, stream_);
       rmm::device_uvector<int64_t> indices_ivfflat_dev(queries_size, stream_);
-      raft::spatial::knn::ivf_flat::index_params index_params;
-      raft::spatial::knn::ivf_flat::search_params search_params;
-      index_params.n_lists   = ps.nlist;
-      index_params.metric    = ps.metric;
-      search_params.n_probes = ps.nprobe;
-      raft::spatial::knn::knnIndex index;
-
-      approx_knn_build_index(
-        handle_, &index, index_params, database.data(), ps.num_db_vecs, ps.dim);
-      handle_.sync_stream(stream_);
 
-      approx_knn_search(handle_,
-                        distances_ivfflat_dev.data(),
-                        indices_ivfflat_dev.data(),
-                        &index,
-                        search_params,
-                        ps.k,
-                        search_queries.data(),
-                        ps.num_queries);
-      update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
-      update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
-      handle_.sync_stream(stream_);
-    }
+      {
+        // legacy interface
+        raft::spatial::knn::IVFFlatParam ivfParams;
+        ivfParams.nprobe = ps.nprobe;
+        ivfParams.nlist  = ps.nlist;
+        raft::spatial::knn::knnIndex index;
+        index.index   = nullptr;
+        index.gpu_res = nullptr;
+
+        approx_knn_build_index(handle_,
+                               &index,
+                               dynamic_cast<raft::spatial::knn::knnIndexParam*>(&ivfParams),
+                               ps.metric,
+                               0,
+                               database.data(),
+                               ps.num_db_vecs,
+                               ps.dim);
+        handle_.sync_stream(stream_);
+        approx_knn_search(handle_,
+                          distances_ivfflat_dev.data(),
+                          indices_ivfflat_dev.data(),
+                          &index,
+                          ps.k,
+                          search_queries.data(),
+                          ps.num_queries);
+
+        update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
+        update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
+        handle_.sync_stream(stream_);
+      }
 
-    // unless something is really wrong with clustering, this could serve as a lower bound on recall
-    double min_recall = static_cast<double>(ps.nprobe) / static_cast<double>(ps.nlist);
-    // verify.
-    ASSERT_TRUE(eval_knn(indices_naive,
-                         indices_ivfflat,
-                         distances_naive,
-                         distances_ivfflat,
+      ASSERT_TRUE(eval_knn(indices_naive,
+                           indices_ivfflat,
+                           distances_naive,
+                           distances_ivfflat,
+                           ps.num_queries,
+                           ps.k,
+                           float(0.001),
+                           min_recall));
+      {
+        // new interface
+        raft::spatial::knn::ivf_flat::index_params index_params;
+        raft::spatial::knn::ivf_flat::search_params search_params;
+        index_params.n_lists   = ps.nlist;
+        index_params.metric    = ps.metric;
+        search_params.n_probes = ps.nprobe;
+
+        auto index = ivf_flat::build(
+          handle_, index_params, database.data(), int64_t(ps.num_db_vecs), ps.dim, stream_);
+
+        ivf_flat::search(handle_,
+                         search_params,
+                         index,
+                         search_queries.data(),
                          ps.num_queries,
                          ps.k,
-                         float(0.001),
-                         min_recall));
+                         indices_ivfflat_dev.data(),
+                         distances_ivfflat_dev.data(),
+                         stream_);
+
+        update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
+        update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
+        handle_.sync_stream(stream_);
+      }
+      ASSERT_TRUE(eval_knn(indices_naive,
+                           indices_ivfflat,
+                           distances_naive,
+                           distances_ivfflat,
+                           ps.num_queries,
+                           ps.k,
+                           float(0.001),
+                           min_recall));
+    }
   }
 
   void SetUp() override

From e73b259d359ee50e78942e7e58fbf924d00a3023 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 4 Jul 2022 16:28:43 +0200
Subject: [PATCH 105/118] Remove the stream argument from the public API

---
 cpp/bench/spatial/knn.cu                        | 13 +++----------
 .../raft/spatial/knn/detail/ann_quantized.cuh   | 13 +++----------
 cpp/include/raft/spatial/knn/ivf_flat.cuh       | 17 ++++++-----------
 cpp/include/raft/spatial/knn/ivf_flat_types.hpp |  7 +++++--
 cpp/test/spatial/ann_ivf_flat.cu                |  7 +++----
 5 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index c5d208ab9c..ace2c0d088 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -141,7 +141,7 @@ struct ivf_flat_knn {
     index_params.n_lists = 4096;
     index_params.metric  = raft::distance::DistanceType::L2Expanded;
     index.emplace(raft::spatial::knn::ivf_flat::build(
-      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims), handle.get_stream()));
+      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
   }
 
   void search(const raft::handle_t& handle,
@@ -150,15 +150,8 @@ struct ivf_flat_knn {
               IdxT* out_idxs)
   {
     search_params.n_probes = 20;
-    raft::spatial::knn::ivf_flat::search(handle,
-                                         search_params,
-                                         *index,
-                                         search_items,
-                                         ps.n_queries,
-                                         ps.k,
-                                         out_idxs,
-                                         out_dists,
-                                         handle.get_stream());
+    raft::spatial::knn::ivf_flat::search(
+      handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
   }
 };
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 7b07d02d36..6b3bb73acc 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -130,7 +130,7 @@ void approx_knn_build_index(const handle_t& handle,
                       metric == raft::distance::DistanceType::InnerProduct)) {
     auto new_params               = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
     index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
-      ivf_flat::build(handle, new_params, index_array, int64_t(n), D, stream));
+      ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
   } else {
     RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
     index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
@@ -180,15 +180,8 @@ void approx_knn_search(const handle_t& handle,
   } else if (index->ivf_flat<T, int64_t>()) {
     ivf_flat::search_params params;
     params.n_probes = index->nprobe;
-    ivf_flat::search(handle,
-                     params,
-                     *(index->ivf_flat<T, int64_t>()),
-                     query_array,
-                     n,
-                     k,
-                     indices,
-                     distances,
-                     handle.get_stream());
+    ivf_flat::search(
+      handle, params, *(index->ivf_flat<T, int64_t>()), query_array, n, k, indices, distances);
   } else {
     RAFT_FAIL("The model is not trained");
   }
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 1564f74dcd..6d0e2e5911 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -43,19 +43,16 @@ namespace raft::spatial::knn::ivf_flat {
  * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
  * @param n_rows the number of samples
  * @param dim the dimensionality of the data
- * @param stream
  *
  * @return the constructed ivf-flat index
  */
 template <typename T, typename IdxT = uint32_t>
-inline auto build(const handle_t& handle,
-                  const index_params& params,
-                  const T* dataset,
-                  IdxT n_rows,
-                  uint32_t dim,
-                  rmm::cuda_stream_view stream) -> index<T, IdxT>
+inline auto build(
+  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
+  -> index<T, IdxT>
 {
-  return raft::spatial::knn::ivf_flat::detail::build(handle, params, dataset, n_rows, dim, stream);
+  return raft::spatial::knn::ivf_flat::detail::build(
+    handle, params, dataset, n_rows, dim, handle.get_stream());
 }
 
 /**
@@ -73,7 +70,6 @@ inline auto build(const handle_t& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param stream
  * @param mr an optional memory resource to use across the searches (you can provide a large enough
  *           memory pool here to avoid memory allocations within search).
  */
@@ -86,11 +82,10 @@ inline void search(const handle_t& handle,
                    uint32_t k,
                    IdxT* neighbors,
                    float* distances,
-                   rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr = nullptr)
 {
   return raft::spatial::knn::ivf_flat::detail::search(
-    handle, params, index, queries, n_queries, k, neighbors, distances, stream, mr);
+    handle, params, index, queries, n_queries, k, neighbors, distances, handle.get_stream(), mr);
 }
 
 }  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index f02514d00e..e8d6cb74eb 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -39,8 +39,11 @@ constexpr static uint32_t kIndexGroupSize = 32;
  * the index building process.
  *
  * It would seem logical to make all the type's members constant. However, we can't do that
- * because it would imply copying data when the index is moved. The current solution to this
- * is to make all public factory functions, such as `ivf_flat::build` return `const index`.
+ * because it would imply copying data when the index is moved. And we also cannot return
+ * `const index` in our factory functions, such as `ivf_flat::build`, because then the result
+ * couldn't be moved.
+ * Therefore, we return `index` mutable as-is, with a warning to the users that there are no
+ * protection mechanisms against manipulating the data.
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices in the source dataset
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index f41127942d..22abb32659 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -198,8 +198,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
         index_params.metric    = ps.metric;
         search_params.n_probes = ps.nprobe;
 
-        auto index = ivf_flat::build(
-          handle_, index_params, database.data(), int64_t(ps.num_db_vecs), ps.dim, stream_);
+        auto index =
+          ivf_flat::build(handle_, index_params, database.data(), int64_t(ps.num_db_vecs), ps.dim);
 
         ivf_flat::search(handle_,
                          search_params,
@@ -208,8 +208,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
                          ps.num_queries,
                          ps.k,
                          indices_ivfflat_dev.data(),
-                         distances_ivfflat_dev.data(),
-                         stream_);
+                         distances_ivfflat_dev.data());
 
         update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
         update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);

From 03ebbe0418c4cd4b2a83c0d8a1ae6b7406bdcc86 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 6 Jul 2022 14:59:30 +0200
Subject: [PATCH 106/118] Simplify kmeans::predict a little bit

---
 .../knn/detail/ann_kmeans_balanced.cuh        | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 30a8262e4a..ec5f78e952 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -185,7 +185,6 @@ void update_centers(float* centers,
  * @param n_rows number samples in the `dataset`
  * @param[out] labels output predictions [n_rows]
  * @param metric
- * @param is_center_set
  * @param[in] centers_temp optional [n_clusters, dim]
  * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
  * @param shall_update_centers
@@ -201,7 +200,6 @@ void predict(const handle_t& handle,
              uint32_t n_rows,
              uint32_t* labels,
              raft::distance::DistanceType metric,
-             bool is_center_set,
              float* centers_temp,
              uint32_t* cluster_sizes,
              bool shall_update_centers,
@@ -215,20 +213,6 @@ void predict(const handle_t& handle,
       "cuann_kmeans_predict: empty dataset (n_rows = %d, n_clusters = %d)", n_rows, n_clusters);
     return;
   }
-  if (!is_center_set) {
-    // If centers are not set, the labels will be determined randomly.
-    linalg::writeOnlyUnaryOp(
-      labels,
-      n_rows,
-      [n_clusters] __device__(uint32_t * out, uint32_t i) { *out = i % n_clusters; },
-      stream);
-    if (centers_temp != nullptr && cluster_sizes != nullptr) {
-      // update centers
-      update_centers(
-        centers, n_clusters, dim, dataset, n_rows, labels, metric, cluster_sizes, nullptr, stream);
-    }
-    return;
-  }
 
   const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows);
 
@@ -382,7 +366,7 @@ auto adjust_centers(float* centers,
 
 /** predict & adjust_centers combined in an iterative process. */
 template <typename T>
-auto build_clusters(const handle_t& handle,
+void build_clusters(const handle_t& handle,
                     uint32_t n_iters,
                     size_t dim,
                     const T* dataset,  // managedl [n_rows, dim]
@@ -397,6 +381,25 @@ auto build_clusters(const handle_t& handle,
 {
   rmm::device_uvector<float> cluster_centers_tmp(n_clusters * dim, stream, device_memory);
 
+  // "randomly initialize labels"
+  linalg::writeOnlyUnaryOp(
+    cluster_labels,
+    n_rows,
+    [n_clusters] __device__(uint32_t * out, uint32_t i) { *out = i % n_clusters; },
+    stream);
+
+  // update centers to match the initialized labels.
+  update_centers(cluster_centers,
+                 n_clusters,
+                 dim,
+                 dataset,
+                 n_rows,
+                 cluster_labels,
+                 metric,
+                 cluster_sizes,
+                 nullptr,
+                 stream);
+
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
     kmeans::predict(handle,
                     cluster_centers,
@@ -406,7 +409,6 @@ auto build_clusters(const handle_t& handle,
                     n_rows,
                     cluster_labels,
                     metric,
-                    (iter != 0),
                     cluster_centers_tmp.data(),
                     cluster_sizes,
                     true,
@@ -689,7 +691,6 @@ void build_optimized_kmeans(const handle_t& handle,
                     n_rows_train,
                     labels,
                     metric,
-                    true,
                     centers_temp.data(),
                     cluster_sizes,
                     true,
@@ -708,7 +709,6 @@ void build_optimized_kmeans(const handle_t& handle,
                   n_rows,
                   labels,
                   metric,
-                  true,
                   centers_temp.data(),
                   cluster_sizes,
                   true,
@@ -723,7 +723,6 @@ void build_optimized_kmeans(const handle_t& handle,
                   n_rows,
                   labels,
                   metric,
-                  true,
                   centers_temp.data(),
                   cluster_sizes,
                   false,

From cde7f97b06722574ec0b134d1360ced97b82afaa Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 7 Jul 2022 11:30:32 +0200
Subject: [PATCH 107/118] Factor out predict from the other ops in kmeans for
 use outside of the module

---
 .../knn/detail/ann_kmeans_balanced.cuh        | 357 +++++++++---------
 .../raft/spatial/knn/detail/ann_utils.cuh     |  14 +-
 .../spatial/knn/detail/ivf_flat_build.cuh     |  17 +-
 3 files changed, 191 insertions(+), 197 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index ec5f78e952..ae2caf9c28 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -33,36 +33,60 @@
 #include <rmm/device_vector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 
 namespace raft::spatial::knn::detail::kmeans {
 
-void predict_core_(const handle_t& handle,
-                   const float* centers,  // [n_clusters, dim]
-                   uint32_t n_clusters,
-                   uint32_t dim,
-                   const float* dataset,  // [n_rows, dim]
-                   uint32_t n_rows,
-                   uint32_t* labels,  // [n_rows]
-                   raft::distance::DistanceType metric,
-                   rmm::mr::device_memory_resource* mr,
-                   rmm::cuda_stream_view stream)
+/**
+ * @brief Predict labels for the dataset; floats only.
+ *
+ * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows
+ * * n_cluster * sizeof(float)).
+ *
+ * @param handle
+ * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
+ * @param n_clusters number of clusters/centers
+ * @param dim dimensionality of the data
+ * @param[in] dataset a pointer to the data [n_rows, dim]
+ * @param n_rows number samples in the `dataset`
+ * @param[out] labels output predictions [n_rows]
+ * @param metric
+ * @param stream
+ * @param mr (optional) memory resource to use for temporary allocations
+ */
+void predict_float_core(const handle_t& handle,
+                        const float* centers,
+                        uint32_t n_clusters,
+                        uint32_t dim,
+                        const float* dataset,
+                        size_t n_rows,
+                        uint32_t* labels,
+                        raft::distance::DistanceType metric,
+                        rmm::cuda_stream_view stream,
+                        rmm::mr::device_memory_resource* mr)
 {
   rmm::device_uvector<float> distances(n_rows * n_clusters, stream, mr);
 
   float alpha;
   float beta;
-  if (metric == raft::distance::DistanceType::InnerProduct) {
-    alpha = -1.0;
-    beta  = 0.0;
-  } else {
-    rmm::device_uvector<float> sqsum_centers(n_clusters, stream, mr);
-    rmm::device_uvector<float> sqsum_data(n_rows, stream, mr);
-    utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream);
-    utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream);
-    utils::outer_add(
-      sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream);
-    alpha = -2.0;
-    beta  = 1.0;
+  switch (metric) {
+    case raft::distance::DistanceType::InnerProduct: {
+      alpha = -1.0;
+      beta  = 0.0;
+    } break;
+    case raft::distance::DistanceType::L2Expanded:
+    case raft::distance::DistanceType::L2Unexpanded: {
+      rmm::device_uvector<float> sqsum_centers(n_clusters, stream, mr);
+      rmm::device_uvector<float> sqsum_data(n_rows, stream, mr);
+      utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream);
+      utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream);
+      utils::outer_add(
+        sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream);
+      alpha = -2.0;
+      beta  = 1.0;
+    } break;
+    // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
+    default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
   }
   linalg::gemm(handle,
                true,
@@ -92,7 +116,7 @@ void predict_core_(const handle_t& handle,
  * @param n_rows dataset size
  * @return a suggested minibatch size
  */
-constexpr auto calc_minibatch_size(uint32_t n_clusters, uint32_t n_rows) -> uint32_t
+constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t
 {
   n_clusters              = std::max<uint32_t>(1, n_clusters);
   uint32_t minibatch_size = (1 << 20);
@@ -101,21 +125,17 @@ constexpr auto calc_minibatch_size(uint32_t n_clusters, uint32_t n_rows) -> uint
     minibatch_size += 32;
     minibatch_size -= minibatch_size % 64;
   }
-  minibatch_size = std::min<uint32_t>(minibatch_size, n_rows);
+  minibatch_size = uint32_t(std::min<size_t>(minibatch_size, n_rows));
   return minibatch_size;
 }
 
 /**
- * @brief update kmeans centers
+ * @brief Given the data and labels, calculate cluster centers and sizes in one sweep.
  *
  * Let S_i = {x_k | x_k \in dataset & labels[k] == i} be the vectors in the dataset with label i.
  *   On exit centers_i = normalize(\sum_{x \in S_i} x), where `normalize` depends on the distance
  * type.
  *
- * If accumulated_centers is not null, then it is expected that the summation is already done and
- * the results are stored in accumulated_centers. In that case only the normalization will be
- * applied.
- *
  * NB: `centers` and `cluster_sizes` must be accessible on GPU due to
  * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are
  * accessible from the same place.
@@ -128,38 +148,29 @@ constexpr auto calc_minibatch_size(uint32_t n_clusters, uint32_t n_rows) -> uint
  * @tparam T element type
  *
  * @param[out] centers pointer to the output [n_clusters, dim]
+ * @param[out] cluster_sizes number of rows in each cluster [n_clusters]
  * @param n_clusters number of clusters/centers
  * @param dim dimensionality of the data
  * @param[in] dataset a pointer to the data [n_rows, dim]
  * @param n_rows number samples in the `dataset`
  * @param[in] labels output predictions [n_rows]
  * @param metric
- * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
- * @param[in] accumulated_centers (optional) pre-computed accumulated sums
- *                                (non-normalized centers) [n_clusters, dim]
  * @param stream
  */
 template <typename T>
-void update_centers(float* centers,
-                    uint32_t n_clusters,
-                    uint32_t dim,
-                    const T* dataset,
-                    uint32_t n_rows,
-                    const uint32_t* labels,
-                    raft::distance::DistanceType metric,
-                    uint32_t* cluster_sizes,
-                    const float* accumulated_centers,
-                    rmm::cuda_stream_view stream)
+void calc_centers_and_sizes(float* centers,
+                            uint32_t* cluster_sizes,
+                            uint32_t n_clusters,
+                            uint32_t dim,
+                            const T* dataset,
+                            size_t n_rows,
+                            const uint32_t* labels,
+                            raft::distance::DistanceType metric,
+                            rmm::cuda_stream_view stream)
 {
-  if (accumulated_centers == nullptr) {
-    // accumulate
-    utils::memset(centers, 0, sizeof(float) * n_clusters * dim, stream);
-    utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
-    utils::accumulate_into_selected<T>(
-      n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
-  } else {
-    raft::copy(centers, accumulated_centers, n_clusters * dim, stream);
-  }
+  utils::memset(centers, 0, sizeof(float) * n_clusters * dim, stream);
+  utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
+  utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
 
   if (metric == raft::distance::DistanceType::InnerProduct) {
     // normalize
@@ -171,109 +182,72 @@ void update_centers(float* centers,
 }
 
 /**
- * @brief Predict labels for the dataset. For each point we assign the label of the nearest center.
+ * @brief Predict labels for the dataset.
  *
- * NB: seems that all pointers here are accessed by devicie code only
+ * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows
+ * * n_cluster * sizeof(float)).
  *
  * @tparam T element type
  *
  * @param handle
- * @param[inout] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
+ * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
  * @param n_clusters number of clusters/centers
  * @param dim dimensionality of the data
  * @param[in] dataset a pointer to the data [n_rows, dim]
  * @param n_rows number samples in the `dataset`
  * @param[out] labels output predictions [n_rows]
  * @param metric
- * @param[in] centers_temp optional [n_clusters, dim]
- * @param[inout] cluster_sizes (optional) number of rows in each cluster [n_clusters]
- * @param shall_update_centers
  * @param stream
  * @param mr (optional) memory resource to use for temporary allocations
  */
+
 template <typename T>
 void predict(const handle_t& handle,
-             float* centers,
+             const float* centers,
              uint32_t n_clusters,
              uint32_t dim,
              const T* dataset,
-             uint32_t n_rows,
+             size_t n_rows,
              uint32_t* labels,
              raft::distance::DistanceType metric,
-             float* centers_temp,
-             uint32_t* cluster_sizes,
-             bool shall_update_centers,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::mr::device_memory_resource* mr = nullptr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "kmeans::predict(%u, %u)", n_rows, n_clusters);
-  if (n_rows == 0) {
-    RAFT_LOG_WARN(
-      "cuann_kmeans_predict: empty dataset (n_rows = %d, n_clusters = %d)", n_rows, n_clusters);
-    return;
-  }
-
+  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
   const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows);
-
-  if (centers_temp != nullptr && cluster_sizes != nullptr) {
-    utils::memset(centers_temp, 0, sizeof(float) * n_clusters * dim, stream);
-    utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
-  }
-
-  rmm::device_uvector<float> cur_dataset(max_minibatch_size * dim, stream, mr);
-  for (uint32_t offset = 0; offset < n_rows; offset += max_minibatch_size) {
+  rmm::device_uvector<float> cur_dataset(
+    std::is_same_v<T, float> ? 0 : max_minibatch_size * dim, stream, mr);
+  auto cur_dataset_ptr = cur_dataset.data();
+  for (size_t offset = 0; offset < n_rows; offset += max_minibatch_size) {
     auto minibatch_size = std::min<uint32_t>(max_minibatch_size, n_rows - offset);
 
     if constexpr (std::is_same_v<T, float>) {
-      raft::copy(cur_dataset.data(), dataset + offset * dim, minibatch_size * dim, stream);
+      cur_dataset_ptr = const_cast<float*>(dataset + offset * dim);
     } else {
-      linalg::unaryOp(cur_dataset.data(),
+      linalg::unaryOp(cur_dataset_ptr,
                       dataset + offset * dim,
                       minibatch_size * dim,
                       utils::mapping<float>{},
                       stream);
     }
-    // predict
-    predict_core_(handle,
-                  centers,
-                  n_clusters,
-                  dim,
-                  cur_dataset.data(),
-                  minibatch_size,
-                  labels + offset,
-                  metric,
-                  mr,
-                  stream);
-
-    if ((centers_temp != nullptr) && (cluster_sizes != nullptr)) {
-      // accumulate
-      utils::accumulate_into_selected<float>(minibatch_size,
-                                             dim,
-                                             centers_temp,
-                                             cluster_sizes,
-                                             cur_dataset.data(),
-                                             labels + offset,
-                                             stream);
-    }
-  }
 
-  if ((centers_temp != nullptr) && (cluster_sizes != nullptr) && shall_update_centers) {
-    update_centers(centers,
-                   n_clusters,
-                   dim,
-                   dataset,
-                   n_rows,
-                   labels,
-                   metric,
-                   cluster_sizes,
-                   centers_temp,
-                   stream);
+    predict_float_core(handle,
+                       centers,
+                       n_clusters,
+                       dim,
+                       cur_dataset_ptr,
+                       minibatch_size,
+                       labels + offset,
+                       metric,
+                       stream,
+                       mr);
   }
 }
 
 /**
- * @brief Adjust centers which have small number of entries.
+ * @brief Adjust centers for clusters that have small number of entries.
  *
  * For each cluster, where the cluster size is not bigger than a threshold, the center is moved
  * towards a data point that belongs to a large cluster.
@@ -323,7 +297,7 @@ auto adjust_centers(float* centers,
   static size_t i_primes = 0;
 
   bool adjusted    = false;
-  uint32_t average = n_rows / n_clusters;
+  uint32_t average = static_cast<uint32_t>(n_rows / size_t(n_clusters));
   uint32_t ofst;
 
   do {
@@ -379,8 +353,6 @@ void build_clusters(const handle_t& handle,
                     rmm::mr::device_memory_resource* device_memory,
                     rmm::cuda_stream_view stream)
 {
-  rmm::device_uvector<float> cluster_centers_tmp(n_clusters * dim, stream, device_memory);
-
   // "randomly initialize labels"
   linalg::writeOnlyUnaryOp(
     cluster_labels,
@@ -389,31 +361,36 @@ void build_clusters(const handle_t& handle,
     stream);
 
   // update centers to match the initialized labels.
-  update_centers(cluster_centers,
-                 n_clusters,
-                 dim,
-                 dataset,
-                 n_rows,
-                 cluster_labels,
-                 metric,
-                 cluster_sizes,
-                 nullptr,
-                 stream);
+  calc_centers_and_sizes(cluster_centers,
+                         cluster_sizes,
+                         n_clusters,
+                         dim,
+                         dataset,
+                         n_rows,
+                         cluster_labels,
+                         metric,
+                         stream);
 
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
-    kmeans::predict(handle,
-                    cluster_centers,
-                    n_clusters,
-                    dim,
-                    dataset,
-                    n_rows,
-                    cluster_labels,
-                    metric,
-                    cluster_centers_tmp.data(),
-                    cluster_sizes,
-                    true,
-                    stream,
-                    device_memory);
+    predict(handle,
+            cluster_centers,
+            n_clusters,
+            dim,
+            dataset,
+            n_rows,
+            cluster_labels,
+            metric,
+            stream,
+            device_memory);
+    calc_centers_and_sizes(cluster_centers,
+                           cluster_sizes,
+                           n_clusters,
+                           dim,
+                           dataset,
+                           n_rows,
+                           cluster_labels,
+                           metric,
+                           stream);
 
     if (iter + 1 < 2 * n_iters) {
       if (kmeans::adjust_centers(cluster_centers,
@@ -443,7 +420,7 @@ auto arrange_fine_clusters(size_t n_clusters,
   fine_clusters_csum[0] = 0;
 
   uint32_t n_lists_rem            = n_clusters;
-  uint32_t n_rows_rem             = n_rows;
+  size_t n_rows_rem               = n_rows;
   uint32_t mesocluster_size_sum   = 0;
   uint32_t mesocluster_size_max   = 0;
   uint32_t fine_clusters_nums_max = 0;
@@ -528,7 +505,7 @@ auto build_fine_clusters(const handle_t& handle,
   uint32_t n_clusters_done = 0;
   for (uint32_t i = 0; i < n_mesoclusters; i++) {
     uint32_t k = 0;
-    for (uint32_t j = 0; j < n_rows; j++) {
+    for (size_t j = 0; j < n_rows; j++) {
       if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; }
     }
     RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i);
@@ -577,7 +554,6 @@ auto build_fine_clusters(const handle_t& handle,
  * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
  * @param n_rows number of rows in the input
  * @param[out] labels a device pointer to the output labels [n_rows]
- * @param[out] cluster_sizes a device pointer to the found cluster sizes [n_cluster]
  * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
  * @param n_cluster
  * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
@@ -592,7 +568,6 @@ void build_optimized_kmeans(const handle_t& handle,
                             const T* dataset,
                             size_t n_rows,
                             uint32_t* labels,
-                            uint32_t* cluster_sizes,
                             float* cluster_centers,
                             size_t n_clusters,
                             double trainset_fraction,
@@ -606,7 +581,7 @@ void build_optimized_kmeans(const handle_t& handle,
     std::max<size_t>(1, n_rows / std::max<size_t>(trainset_fraction * n_rows, n_clusters));
   auto n_rows_train = n_rows / trainset_ratio;
 
-  uint32_t n_mesoclusters = std::pow<double>(n_clusters, 0.5) + 0.5;
+  uint32_t n_mesoclusters = std::min<uint32_t>(n_clusters, std::sqrt(n_clusters) + 0.5);
   RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters);
 
   rmm::mr::managed_memory_resource managed_memory;
@@ -678,56 +653,66 @@ void build_optimized_kmeans(const handle_t& handle,
                                              stream);
   RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
 
-  rmm::device_uvector<float> centers_temp(n_clusters * dim, stream, device_memory);
+  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, device_memory);
 
   // fit clusters using the trainset
   for (int iter = 0; iter < 2; iter++) {
     // NB: labels.size == n_rows >= n_rows_train; the output is not used.
-    kmeans::predict(handle,
-                    cluster_centers,
-                    n_clusters,
-                    dim,
-                    trainset.data(),
-                    n_rows_train,
-                    labels,
-                    metric,
-                    centers_temp.data(),
-                    cluster_sizes,
-                    true,
-                    stream,
-                    device_memory);
+    predict(handle,
+            cluster_centers,
+            n_clusters,
+            dim,
+            trainset.data(),
+            n_rows_train,
+            labels,
+            metric,
+            stream,
+            device_memory);
+    calc_centers_and_sizes(cluster_centers,
+                           cluster_sizes.data(),
+                           n_clusters,
+                           dim,
+                           trainset.data(),
+                           n_rows_train,
+                           labels,
+                           metric,
+                           stream);
   }
 
   RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
 
   // fit clusters using the whole dataset
-  kmeans::predict(handle,
-                  cluster_centers,
-                  n_clusters,
-                  dim,
-                  dataset,
-                  n_rows,
-                  labels,
-                  metric,
-                  centers_temp.data(),
-                  cluster_sizes,
-                  true,
-                  stream,
-                  device_memory);
-
-  kmeans::predict(handle,
-                  cluster_centers,
-                  n_clusters,
-                  dim,
-                  dataset,
-                  n_rows,
-                  labels,
-                  metric,
-                  centers_temp.data(),
-                  cluster_sizes,
-                  false,
-                  stream,
-                  device_memory);
+  predict(handle,
+          cluster_centers,
+          n_clusters,
+          dim,
+          dataset,
+          n_rows,
+          labels,
+          metric,
+          stream,
+          device_memory);
+  // update the cluster centers one last time to better represent the whole dataset
+  calc_centers_and_sizes(cluster_centers,
+                         cluster_sizes.data(),
+                         n_clusters,
+                         dim,
+                         dataset,
+                         n_rows,
+                         labels,
+                         metric,
+                         stream);
+
+  predict(handle,
+          cluster_centers,
+          n_clusters,
+          dim,
+          dataset,
+          n_rows,
+          labels,
+          metric,
+          stream,
+          device_memory);
 }
 
 }  // namespace raft::spatial::knn::detail::kmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index f1d198a99a..eae444f4f8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -278,13 +278,13 @@ __global__ void accumulate_into_selected_kernel(uint32_t n_rows,
  *
  * @param n_cols number of columns in all matrices
  * @param[out] output output matrix [..., n_cols]
- * @param[out] selection_counters number of occurrences of each row id in row_ids [..., n_cols]
+ * @param[inout] selection_counters number of occurrences of each row id in row_ids [..., n_cols]
  * @param n_rows number of rows in the input
  * @param[in] input row-major input matrix [n_rows, n_cols]
  * @param[in] row_ids row indices in the output matrix [n_rows]
  */
 template <typename T>
-void accumulate_into_selected(uint32_t n_rows,
+void accumulate_into_selected(size_t n_rows,
                               uint32_t n_cols,
                               float* output,
                               uint32_t* selection_counters,
@@ -296,17 +296,17 @@ void accumulate_into_selected(uint32_t n_rows,
     case pointer_residency::host_and_device:
     case pointer_residency::device_only: {
       uint32_t block_dim = 128;
-      auto grid_dim      = static_cast<uint32_t>(ceildiv<uint64_t>(
-        static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(n_cols), block_dim));
+      auto grid_dim =
+        static_cast<uint32_t>(ceildiv<size_t>(n_rows * static_cast<size_t>(n_cols), block_dim));
       accumulate_into_selected_kernel<T><<<grid_dim, block_dim, 0, stream>>>(
         n_rows, n_cols, output, selection_counters, input, row_ids);
     } break;
     case pointer_residency::host_only: {
       stream.synchronize();
-      for (uint64_t i = 0; i < n_rows; i++) {
-        uint64_t l = row_ids[i];
+      for (size_t i = 0; i < n_rows; i++) {
+        uint32_t l = row_ids[i];
         selection_counters[l]++;
-        for (uint64_t j = 0; j < n_cols; j++) {
+        for (uint32_t j = 0; j < n_cols; j++) {
           output[j + n_cols * l] += mapping<float>{}(input[j + n_cols * i]);
         }
       }
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 6b7f7662be..f96c900e3f 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/mdarray.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/pow2_utils.cuh>
+#include <raft/stats/histogram.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -124,9 +125,7 @@ inline auto build(const handle_t& handle,
 
   // kmeans cluster ids for the dataset
   rmm::device_uvector<uint32_t> labels(n_rows, stream);
-  auto&& centers      = make_device_mdarray<float>(stream, n_lists, dim);
-  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
-  auto list_sizes_ptr = list_sizes.data();
+  auto&& centers = make_device_mdarray<float>(stream, n_lists, dim);
 
   // Predict labels of the whole dataset
   kmeans::build_optimized_kmeans(handle,
@@ -135,13 +134,23 @@ inline auto build(const handle_t& handle,
                                  dataset,
                                  n_rows,
                                  labels.data(),
-                                 list_sizes_ptr,
                                  centers.data(),
                                  n_lists,
                                  params.kmeans_trainset_fraction,
                                  params.metric,
                                  stream);
 
+  // sizes of the clusters
+  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
+  auto list_sizes_ptr = list_sizes.data();
+  raft::stats::histogram<uint32_t, size_t>(raft::stats::HistTypeAuto,
+                                           reinterpret_cast<int32_t*>(list_sizes_ptr),
+                                           size_t(n_lists),
+                                           labels.data(),
+                                           n_rows,
+                                           1,
+                                           stream);
+
   // Calculate offsets into cluster data using exclusive scan
   auto&& list_offsets   = make_device_mdarray<IdxT>(stream, n_lists + 1);
   auto list_offsets_ptr = list_offsets.data();

From 305bbcd1aa7f845bcf5fa226399c46b43b77efef Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 20 Jul 2022 17:50:32 +0200
Subject: [PATCH 108/118] Add new function extend(index, new_vecs, new_inds) to
 ivf_flat

---
 cpp/include/raft/spatial/knn/common.hpp       |   9 +
 .../knn/detail/ann_kmeans_balanced.cuh        | 220 +++++++++---------
 .../raft/spatial/knn/detail/ann_utils.cuh     |  97 ++++++--
 .../spatial/knn/detail/ivf_flat_build.cuh     | 206 +++++++++++-----
 cpp/include/raft/spatial/knn/ivf_flat.cuh     |  29 +++
 cpp/test/spatial/ann_ivf_flat.cu              |  20 +-
 6 files changed, 390 insertions(+), 191 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/common.hpp b/cpp/include/raft/spatial/knn/common.hpp
index 4704093065..caaa951a66 100644
--- a/cpp/include/raft/spatial/knn/common.hpp
+++ b/cpp/include/raft/spatial/knn/common.hpp
@@ -30,6 +30,15 @@ struct index_params {
   raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
   /** The argument used by some distance metrics. */
   float metric_arg = 2.0f;
+  /**
+   * Whether to add the dataset content to the index, i.e.:
+   *
+   *  - `true` means the index is filled with the dataset vectors and ready to search after calling
+   * `build`.
+   *  - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but
+   * the index is left empty; you'd need to call `extend` on the index afterwards to populate it.
+   */
+  bool add_data_on_build = true;
 };
 
 struct search_params {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index ae2caf9c28..8412d753fd 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -27,7 +27,6 @@
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
-#include <raft/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_vector.hpp>
@@ -147,14 +146,16 @@ constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32
  *
  * @tparam T element type
  *
- * @param[out] centers pointer to the output [n_clusters, dim]
- * @param[out] cluster_sizes number of rows in each cluster [n_clusters]
+ * @param[inout] centers pointer to the output [n_clusters, dim]
+ * @param[inout] cluster_sizes number of rows in each cluster [n_clusters]
  * @param n_clusters number of clusters/centers
  * @param dim dimensionality of the data
  * @param[in] dataset a pointer to the data [n_rows, dim]
  * @param n_rows number samples in the `dataset`
  * @param[in] labels output predictions [n_rows]
- * @param metric
+ * @param reset_counters whether to clear the output arrays before calculating.
+ *    When set to `false`, this function may be used to update existing centers and sizes using
+ *    the weighted average principle.
  * @param stream
  */
 template <typename T>
@@ -165,28 +166,34 @@ void calc_centers_and_sizes(float* centers,
                             const T* dataset,
                             size_t n_rows,
                             const uint32_t* labels,
-                            raft::distance::DistanceType metric,
+                            bool reset_counters,
                             rmm::cuda_stream_view stream)
 {
-  utils::memset(centers, 0, sizeof(float) * n_clusters * dim, stream);
-  utils::memset(cluster_sizes, 0, sizeof(uint32_t) * n_clusters, stream);
-  utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
-
-  if (metric == raft::distance::DistanceType::InnerProduct) {
-    // normalize
-    utils::normalize_rows(n_clusters, dim, centers, stream);
+  if (reset_counters) {
+    utils::memzero(centers, n_clusters * dim, stream);
+    utils::memzero(cluster_sizes, n_clusters, stream);
   } else {
-    // average
-    utils::divide_along_rows(n_clusters, dim, centers, cluster_sizes, stream);
+    utils::map_along_rows(
+      n_clusters,
+      dim,
+      centers,
+      cluster_sizes,
+      [] __device__(float c, uint32_t s) -> float { return c * s; },
+      stream);
   }
+  utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
+  utils::map_along_rows(
+    n_clusters,
+    dim,
+    centers,
+    cluster_sizes,
+    [] __device__(float c, uint32_t s) -> float { return s == 0 ? 0.0f : c / float(s); },
+    stream);
 }
 
 /**
  * @brief Predict labels for the dataset.
  *
- * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows
- * * n_cluster * sizeof(float)).
- *
  * @tparam T element type
  *
  * @param handle
@@ -214,7 +221,7 @@ void predict(const handle_t& handle,
              rmm::mr::device_memory_resource* mr = nullptr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::predict(%u, %u)", n_rows, n_clusters);
+    "kmeans::predict(%zu, %u)", n_rows, n_clusters);
   if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
   const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows);
   rmm::device_uvector<float> cur_dataset(
@@ -264,7 +271,6 @@ void predict(const handle_t& handle,
  * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim]
  * @param n_rows number of rows in `dataset`
  * @param[in] labels a host pointer to the cluster indices [n_rows]
- * @param metric
  * @param[in] cluster_sizes number of rows in each cluster [n_clusters]
  * @param threshold defines a criterion for adjusting a cluster
  *                   (cluster_sizes <= average_size * threshold)
@@ -275,18 +281,17 @@ void predict(const handle_t& handle,
  */
 template <typename T>
 auto adjust_centers(float* centers,
-                    size_t n_clusters,
-                    size_t dim,
+                    uint32_t n_clusters,
+                    uint32_t dim,
                     const T* dataset,
                     size_t n_rows,
                     const uint32_t* labels,
-                    raft::distance::DistanceType metric,
                     const uint32_t* cluster_sizes,
                     float threshold,
                     rmm::cuda_stream_view stream) -> bool
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::adjust_centers(%u, %u)", n_rows, n_clusters);
+    "kmeans::adjust_centers(%zu, %u)", n_rows, n_clusters);
   stream.synchronize();
   if (n_clusters == 0) { return false; }
   constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
@@ -305,33 +310,29 @@ auto adjust_centers(float* centers,
     ofst     = kPrimes[i_primes];
   } while (n_rows % ofst == 0);
 
-  for (size_t l = 0; l < n_clusters; l++) {
+  for (uint32_t l = 0; l < n_clusters; l++) {
+    auto csize = cluster_sizes[l];
     // skip big clusters
-    if (cluster_sizes[l] > static_cast<uint32_t>(average * threshold)) continue;
+    if (csize > static_cast<uint32_t>(average * threshold)) continue;
     // choose a "random" i that belongs to a rather large cluster
     do {
       i = (i + ofst) % n_rows;
     } while (cluster_sizes[labels[i]] < average);
     // Adjust the center of the selected smaller cluster to gravitate towards
     // a sample from the selected larger cluster.
-    size_t li   = labels[i];
-    float sqsum = 0.0;
-    for (size_t j = 0; j < dim; j++) {
-      constexpr float kWc = 7.0;
-      constexpr float kWd = 1.0;
-      float val           = 0;
-      val += kWc * centers[j + dim * li];
-      val += kWd * utils::mapping<float>{}(dataset[j + dim * i]);
-      val /= kWc + kWd;
-      sqsum += val * val;
+    const size_t li = labels[i];
+    // Weight of the current center for the weighted average.
+    // We dump it for anomalously small clusters, but keep constant overwise.
+    const float wc = std::min<float>(csize, 7.0);
+    // Weight for the datapoint used to shift the center.
+    const float wd = 1.0;
+    for (uint32_t j = 0; j < dim; j++) {
+      float val = 0;
+      val += wc * centers[j + dim * li];
+      val += wd * utils::mapping<float>{}(dataset[j + size_t(dim) * i]);
+      val /= wc + wd;
       centers[j + dim * l] = val;
     }
-    if (metric == raft::distance::DistanceType::InnerProduct) {
-      sqsum = sqrt(sqsum);
-      for (size_t j = 0; j < dim; j++) {
-        centers[j + dim * l] /= sqsum;
-      }
-    }
     adjusted = true;
   }
   stream.synchronize();
@@ -342,10 +343,10 @@ auto adjust_centers(float* centers,
 template <typename T>
 void build_clusters(const handle_t& handle,
                     uint32_t n_iters,
-                    size_t dim,
+                    uint32_t dim,
                     const T* dataset,  // managedl [n_rows, dim]
                     size_t n_rows,
-                    size_t n_clusters,
+                    uint32_t n_clusters,
                     float* cluster_centers,    // managed; [n_clusters, dim]
                     uint32_t* cluster_labels,  // managed; [n_rows]
                     uint32_t* cluster_sizes,   // managed; [n_clusters]
@@ -354,24 +355,25 @@ void build_clusters(const handle_t& handle,
                     rmm::cuda_stream_view stream)
 {
   // "randomly initialize labels"
-  linalg::writeOnlyUnaryOp(
-    cluster_labels,
-    n_rows,
-    [n_clusters] __device__(uint32_t * out, uint32_t i) { *out = i % n_clusters; },
-    stream);
+  auto f = [n_clusters] __device__(uint32_t * out, size_t i) {
+    *out = uint32_t(i % size_t(n_clusters));
+  };
+  linalg::writeOnlyUnaryOp<uint32_t, decltype(f), size_t>(cluster_labels, n_rows, f, stream);
 
   // update centers to match the initialized labels.
-  calc_centers_and_sizes(cluster_centers,
-                         cluster_sizes,
-                         n_clusters,
-                         dim,
-                         dataset,
-                         n_rows,
-                         cluster_labels,
-                         metric,
-                         stream);
+  calc_centers_and_sizes(
+    cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream);
 
   for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) {
+    switch (metric) {
+      // For some metrics, cluster calculation and adjustment tends to favor zero center vectors.
+      // To avoid converging to zero, we normalize the center vectors on every iteration.
+      case raft::distance::DistanceType::InnerProduct:
+      case raft::distance::DistanceType::CosineExpanded:
+      case raft::distance::DistanceType::CorrelationExpanded:
+        utils::normalize_rows(n_clusters, dim, cluster_centers, stream);
+      default: break;
+    }
     predict(handle,
             cluster_centers,
             n_clusters,
@@ -389,7 +391,7 @@ void build_clusters(const handle_t& handle,
                            dataset,
                            n_rows,
                            cluster_labels,
-                           metric,
+                           true,
                            stream);
 
     if (iter + 1 < 2 * n_iters) {
@@ -399,7 +401,6 @@ void build_clusters(const handle_t& handle,
                                  dataset,
                                  n_rows,
                                  cluster_labels,
-                                 metric,
                                  cluster_sizes,
                                  (float)1.0 / 4,
                                  stream)) {
@@ -410,8 +411,8 @@ void build_clusters(const handle_t& handle,
 }
 
 /** Calculate how many fine clusters should belong to each mesocluster. */
-auto arrange_fine_clusters(size_t n_clusters,
-                           size_t n_mesoclusters,
+auto arrange_fine_clusters(uint32_t n_clusters,
+                           uint32_t n_mesoclusters,
                            size_t n_rows,
                            const uint32_t* mesocluster_sizes)
 {
@@ -419,14 +420,29 @@ auto arrange_fine_clusters(size_t n_clusters,
   std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
   fine_clusters_csum[0] = 0;
 
-  uint32_t n_lists_rem            = n_clusters;
+  uint32_t n_lists_rem       = n_clusters;
+  uint32_t n_nonempty_ms_rem = 0;
+  for (uint32_t i = 0; i < n_mesoclusters; i++) {
+    n_nonempty_ms_rem += mesocluster_sizes[i] > 0 ? 1 : 0;
+  }
   size_t n_rows_rem               = n_rows;
-  uint32_t mesocluster_size_sum   = 0;
+  size_t mesocluster_size_sum     = 0;
   uint32_t mesocluster_size_max   = 0;
   uint32_t fine_clusters_nums_max = 0;
   for (uint32_t i = 0; i < n_mesoclusters; i++) {
     if (i < n_mesoclusters - 1) {
-      fine_clusters_nums[i] = (double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5;
+      // Although the algorithm is meant to produce balanced clusters, when something
+      // goes wrong, we may get empty clusters (e.g. during development/debugging).
+      // The code below ensures a proportional arrangement of fine cluster numbers
+      // per mesocluster, even if some clusters are empty.
+      if (mesocluster_sizes[i] == 0) {
+        fine_clusters_nums[i] = 0;
+      } else {
+        n_nonempty_ms_rem--;
+        auto s = uint32_t((double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5);
+        s      = std::min<uint32_t>(s, n_lists_rem - n_nonempty_ms_rem);
+        fine_clusters_nums[i] = std::max<uint32_t>(s, 1);
+      }
     } else {
       fine_clusters_nums[i] = n_lists_rem;
     }
@@ -439,13 +455,13 @@ auto arrange_fine_clusters(size_t n_clusters,
   }
 
   RAFT_EXPECTS(mesocluster_size_sum == n_rows,
-               "mesocluster sizes do not add up (%u) to the total trainset size (%zu)",
+               "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)",
                mesocluster_size_sum,
                n_rows);
   RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters,
-               "fine cluster numbers do not add up (%u) to the total number of clusters (%zu)",
+               "fine cluster numbers do not add up (%u) to the total number of clusters (%u)",
                fine_clusters_csum[n_mesoclusters],
-               n_rows
+               n_clusters
 
   );
 
@@ -471,16 +487,16 @@ auto arrange_fine_clusters(size_t n_clusters,
 template <typename T>
 auto build_fine_clusters(const handle_t& handle,
                          uint32_t n_iters,
-                         size_t dim,
+                         uint32_t dim,
                          const T* dataset_mptr,
                          const uint32_t* labels_mptr,
                          size_t n_rows,
                          const uint32_t* fine_clusters_nums,
                          const uint32_t* fine_clusters_csum,
                          const uint32_t* mesocluster_sizes,
-                         size_t n_mesoclusters,
-                         size_t mesocluster_size_max,
-                         size_t fine_clusters_nums_max,
+                         uint32_t n_mesoclusters,
+                         uint32_t mesocluster_size_max,
+                         uint32_t fine_clusters_nums_max,
                          float* cluster_centers,
                          raft::distance::DistanceType metric,
                          rmm::mr::managed_memory_resource* managed_memory,
@@ -515,6 +531,9 @@ auto build_fine_clusters(const handle_t& handle,
                    "Number of fine clusters must be zero for the empty mesocluster (got %d)",
                    fine_clusters_nums[i]);
       continue;
+    } else {
+      RAFT_EXPECTS(fine_clusters_nums[i] > 0,
+                   "Number of fine clusters must be non-zero for a non-empty mesocluster");
     }
 
     utils::copy_selected(
@@ -553,7 +572,6 @@ auto build_fine_clusters(const handle_t& handle,
  * @param dim number of columns in `centers` and `dataset`
  * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
  * @param n_rows number of rows in the input
- * @param[out] labels a device pointer to the output labels [n_rows]
  * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
  * @param n_cluster
  * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training;
@@ -564,18 +582,17 @@ auto build_fine_clusters(const handle_t& handle,
 template <typename T>
 void build_optimized_kmeans(const handle_t& handle,
                             uint32_t n_iters,
-                            size_t dim,
+                            uint32_t dim,
                             const T* dataset,
                             size_t n_rows,
-                            uint32_t* labels,
                             float* cluster_centers,
-                            size_t n_clusters,
+                            uint32_t n_clusters,
                             double trainset_fraction,
                             raft::distance::DistanceType metric,
                             rmm::cuda_stream_view stream)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::build_optimized_kmeans(%u, %u)", n_rows, n_clusters);
+    "kmeans::build_optimized_kmeans(%zu, %u)", n_rows, n_clusters);
 
   auto trainset_ratio =
     std::max<size_t>(1, n_rows / std::max<size_t>(trainset_fraction * n_rows, n_clusters));
@@ -587,7 +604,7 @@ void build_optimized_kmeans(const handle_t& handle,
   rmm::mr::managed_memory_resource managed_memory;
   rmm::mr::device_memory_resource* device_memory = nullptr;
   auto pool_guard                                = raft::get_pool_memory_resource(
-    device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows) * dim * 4);
+    device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows_train) * dim * 4);
   if (pool_guard) {
     RAFT_LOG_DEBUG(
       "kmeans::build_optimized_kmeans: using pool memory resource with initial size %zu bytes",
@@ -634,6 +651,12 @@ void build_optimized_kmeans(const handle_t& handle,
   auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] =
     arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes);
 
+  if (mesocluster_size_max * n_mesoclusters > 2 * n_rows_train) {
+    RAFT_LOG_WARN("build_optimized_kmeans: built unbalanced mesoclusters");
+    RAFT_LOG_INFO_VEC(mesocluster_sizes, n_mesoclusters);
+    RAFT_LOG_INFO_VEC(fine_clusters_nums.data(), n_mesoclusters);
+  }
+
   auto n_clusters_done = build_fine_clusters(handle,
                                              n_iters,
                                              dim,
@@ -654,17 +677,17 @@ void build_optimized_kmeans(const handle_t& handle,
   RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
 
   rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, device_memory);
+  rmm::device_uvector<uint32_t> labels(n_rows_train, stream, device_memory);
 
   // fit clusters using the trainset
   for (int iter = 0; iter < 2; iter++) {
-    // NB: labels.size == n_rows >= n_rows_train; the output is not used.
     predict(handle,
             cluster_centers,
             n_clusters,
             dim,
             trainset.data(),
             n_rows_train,
-            labels,
+            labels.data(),
             metric,
             stream,
             device_memory);
@@ -674,45 +697,10 @@ void build_optimized_kmeans(const handle_t& handle,
                            dim,
                            trainset.data(),
                            n_rows_train,
-                           labels,
-                           metric,
+                           labels.data(),
+                           true,
                            stream);
   }
-
-  RAFT_LOG_DEBUG("(%s) Final fitting.", __func__);
-
-  // fit clusters using the whole dataset
-  predict(handle,
-          cluster_centers,
-          n_clusters,
-          dim,
-          dataset,
-          n_rows,
-          labels,
-          metric,
-          stream,
-          device_memory);
-  // update the cluster centers one last time to better represent the whole dataset
-  calc_centers_and_sizes(cluster_centers,
-                         cluster_sizes.data(),
-                         n_clusters,
-                         dim,
-                         dataset,
-                         n_rows,
-                         labels,
-                         metric,
-                         stream);
-
-  predict(handle,
-          cluster_centers,
-          n_clusters,
-          dim,
-          dataset,
-          n_rows,
-          labels,
-          metric,
-          stream,
-          device_memory);
 }
 
 }  // namespace raft::spatial::knn::detail::kmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index eae444f4f8..e789bafde2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -135,16 +135,17 @@ struct mapping {
  * @param[in] value
  * @param[in] n_bytes
  */
-inline void memset(void* ptr, int value, size_t n_bytes, rmm::cuda_stream_view stream)
+template <typename T>
+inline void memzero(T* ptr, size_t n_elems, rmm::cuda_stream_view stream)
 {
   switch (check_pointer_residency(ptr)) {
     case pointer_residency::host_and_device:
     case pointer_residency::device_only: {
-      RAFT_CUDA_TRY(cudaMemsetAsync(ptr, value, n_bytes, stream));
+      RAFT_CUDA_TRY(cudaMemsetAsync(ptr, 0, n_elems * sizeof(T), stream));
     } break;
     case pointer_residency::host_only: {
       stream.synchronize();
-      ::memset(ptr, value, n_bytes);
+      ::memset(ptr, 0, n_elems * sizeof(T));
     } break;
     default: RAFT_FAIL("memset: unreachable code");
   }
@@ -355,15 +356,15 @@ inline void normalize_rows(uint32_t n_rows, uint32_t n_cols, float* a, rmm::cuda
   normalize_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a);
 }
 
-__global__ void divide_along_rows_kernel(uint32_t n_rows,
-                                         uint32_t n_cols,
-                                         float* a,
-                                         const uint32_t* d)
+template <typename Lambda>
+__global__ void map_along_rows_kernel(
+  uint32_t n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map)
 {
   uint64_t gid = threadIdx.x + blockDim.x * blockIdx.x;
   uint64_t i   = gid / n_cols;
   if (i >= n_rows) return;
-  if (d[i] != 0) { a[gid] /= d[i]; }
+  float& x = a[gid];
+  x        = map(x, d[i]);
 }
 
 /**
@@ -372,20 +373,28 @@ __global__ void divide_along_rows_kernel(uint32_t n_rows,
  *
  * NB: device-only function
  *
- * @param[in] n_rows
- * @param[in] n_cols
+ * @tparam Lambda
+ *
+ * @param n_rows
+ * @param n_cols
  * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
- * @param[in] d device pointer to a vector of divisors [n_rows]
+ * @param[in] d device pointer to a vector [n_rows]
+ * @param map the binary operation to apply on every element of matrix rows and of the vector
  */
-inline void divide_along_rows(
-  uint32_t n_rows, uint32_t n_cols, float* a, const uint32_t* d, rmm::cuda_stream_view stream)
+template <typename Lambda>
+inline void map_along_rows(uint32_t n_rows,
+                           uint32_t n_cols,
+                           float* a,
+                           const uint32_t* d,
+                           Lambda map,
+                           rmm::cuda_stream_view stream)
 {
   dim3 threads(128, 1, 1);
   dim3 blocks(
     ceildiv<uint64_t>(static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(n_cols), threads.x),
     1,
     1);
-  divide_along_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a, d);
+  map_along_rows_kernel<<<blocks, threads, 0, stream>>>(n_rows, n_cols, a, d, map);
 }
 
 template <typename T>
@@ -398,6 +407,66 @@ __global__ void outer_add_kernel(const T* a, uint32_t len_a, const T* b, uint32_
   c[gid] = (a == nullptr ? T(0) : a[i]) + (b == nullptr ? T(0) : b[j]);
 }
 
+template <typename T, typename IdxT>
+__global__ void block_copy_kernel(const IdxT* in_offsets,
+                                  const IdxT* out_offsets,
+                                  IdxT n_blocks,
+                                  const T* in_data,
+                                  T* out_data,
+                                  IdxT n_mult)
+{
+  IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
+  // find the source offset using the binary search.
+  uint32_t l     = 0;
+  uint32_t r     = n_blocks;
+  IdxT in_offset = 0;
+  if (in_offsets[r] * n_mult <= i) return;
+  while (l + 1 < r) {
+    uint32_t c = (l + r) >> 1;
+    IdxT o     = in_offsets[c] * n_mult;
+    if (o <= i) {
+      l         = c;
+      in_offset = o;
+    } else {
+      r = c;
+    }
+  }
+  // copy the data
+  out_data[out_offsets[l] * n_mult - in_offset + i] = in_data[i];
+}
+
+/**
+ * Copy chunks of data from one array to another at given offsets.
+ *
+ * @tparam T element type
+ * @tparam IdxT index type
+ *
+ * @param[in] in_offsets
+ * @param[in] out_offsets
+ * @param n_blocks size of the offset arrays minus one.
+ * @param[in] in_data
+ * @param[out] out_data
+ * @param n_mult constant multiplier for offset values (such as e.g. `dim`)
+ * @param stream
+ */
+template <typename T, typename IdxT>
+void block_copy(const IdxT* in_offsets,
+                const IdxT* out_offsets,
+                IdxT n_blocks,
+                const T* in_data,
+                T* out_data,
+                IdxT n_mult,
+                rmm::cuda_stream_view stream)
+{
+  IdxT in_size;
+  update_host(&in_size, in_offsets + n_blocks, 1, stream);
+  stream.synchronize();
+  dim3 threads(128, 1, 1);
+  dim3 blocks(ceildiv<IdxT>(in_size * n_mult, threads.x), 1, 1);
+  block_copy_kernel<<<blocks, threads, 0, stream>>>(
+    in_offsets, out_offsets, n_blocks, in_data, out_data, n_mult);
+}
+
 /**
  * @brief Fill matrix `c` with all combinations of sums of vectors `a` and `b`.
  *
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index f96c900e3f..d1a49674b6 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -25,7 +25,6 @@
 #include <raft/core/mdarray.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/pow2_utils.cuh>
-#include <raft/stats/histogram.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -46,11 +45,12 @@ using namespace raft::spatial::knn::detail;  // NOLINT
  *   there are no dependencies between threads, hence no constraints on the block size.
  *
  * @tparam T the element type.
- * @tparam IdxT type of the indices in the source dataset
+ * @tparam IdxT type of the indices in the source source_vecs
  *
  * @param[in] labels device pointer to the cluster ids for each row [n_rows]
  * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists]
- * @param[in] dataset device poitner to the input data [n_rows, dim]
+ * @param[in] source_vecs device poitner to the input data [n_rows, dim]
+ * @param[in] source_ixs device poitner to the input indices [n_rows]
  * @param[out] list_data device pointer to the output [index_size, dim]
  * @param[out] list_index device pointer to the source ids corr. to the output [index_size]
  * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists];
@@ -63,7 +63,8 @@ using namespace raft::spatial::knn::detail;  // NOLINT
 template <typename T, typename IdxT>
 __global__ void build_index_kernel(const uint32_t* labels,
                                    const IdxT* list_offsets,
-                                   const T* dataset,
+                                   const T* source_vecs,
+                                   const IdxT* source_ixs,
                                    T* list_data,
                                    IdxT* list_index,
                                    uint32_t* list_sizes_ptr,
@@ -79,7 +80,7 @@ __global__ void build_index_kernel(const uint32_t* labels,
   auto list_offset = list_offsets[list_id];
 
   // Record the source vector id in the index
-  list_index[list_offset + inlist_id] = i;
+  list_index[list_offset + inlist_id] = source_ixs == nullptr ? i : source_ixs[i];
 
   // The data is written in interleaved groups of `index::kGroupSize` vectors
   using interleaved_group = Pow2<kIndexGroupSize>;
@@ -90,71 +91,69 @@ __global__ void build_index_kernel(const uint32_t* labels,
   list_data += (list_offset + group_offset) * dim;
 
   // Point to the source vector
-  dataset += i * dim;
+  source_vecs += i * dim;
 
   // Interleave dimensions of the source vector while recording it.
   // NB: such `veclen` is selected, that `dim % veclen == 0`
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
-      list_data[l * kIndexGroupSize + ingroup_id + j] = dataset[l + j];
+      list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j];
     }
   }
 }
 
-/** See raft::spatial::knn::ivf_flat::build docs */
+/** See raft::spatial::knn::ivf_flat::extend docs */
 template <typename T, typename IdxT>
-inline auto build(const handle_t& handle,
-                  const index_params& params,
-                  const T* dataset,
-                  IdxT n_rows,
-                  uint32_t dim,
-                  rmm::cuda_stream_view stream) -> index<T, IdxT>
+inline auto extend(const handle_t& handle,
+                   const index<T, IdxT>& orig_index,
+                   const T* new_vectors,
+                   const IdxT* new_indices,
+                   IdxT n_rows,
+                   rmm::cuda_stream_view stream) -> index<T, IdxT>
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("ivf_flat::build(%u, %u)", n_rows, dim);
-  static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
-                "unsupported data type");
-  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
+  auto n_lists = orig_index.n_lists();
+  auto dim     = orig_index.dim();
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim);
 
-  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
-  // template parameter (https://github.com/rapidsai/raft/issues/711)
-  uint32_t veclen = 16 / sizeof(T);
-  while (dim % veclen != 0) {
-    veclen = veclen >> 1;
-  }
-  auto n_lists = static_cast<uint32_t>(params.n_lists);
+  RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0,
+               "You must pass data indices when the index is non-empty.");
 
-  // kmeans cluster ids for the dataset
-  rmm::device_uvector<uint32_t> labels(n_rows, stream);
-  auto&& centers = make_device_mdarray<float>(stream, n_lists, dim);
+  rmm::device_uvector<uint32_t> new_labels(n_rows, stream);
+  kmeans::predict(handle,
+                  orig_index.centers.data(),
+                  n_lists,
+                  dim,
+                  new_vectors,
+                  n_rows,
+                  new_labels.data(),
+                  orig_index.metric,
+                  stream);
 
-  // Predict labels of the whole dataset
-  kmeans::build_optimized_kmeans(handle,
-                                 params.kmeans_n_iters,
+  auto&& list_sizes     = make_device_mdarray<uint32_t>(stream, n_lists);
+  auto&& list_offsets   = make_device_mdarray<IdxT>(stream, n_lists + 1);
+  auto list_sizes_ptr   = list_sizes.data();
+  auto list_offsets_ptr = list_offsets.data();
+
+  auto&& centers   = make_device_mdarray<float>(stream, n_lists, dim);
+  auto centers_ptr = centers.data();
+
+  // Calculate the centers and sizes on the new data, starting from the original values
+  raft::copy(centers_ptr, orig_index.centers.data(), centers.size(), stream);
+  raft::copy(list_sizes_ptr, orig_index.list_sizes.data(), list_sizes.size(), stream);
+
+  kmeans::calc_centers_and_sizes(centers_ptr,
+                                 list_sizes_ptr,
+                                 n_lists,
                                  dim,
-                                 dataset,
+                                 new_vectors,
                                  n_rows,
-                                 labels.data(),
-                                 centers.data(),
-                                 n_lists,
-                                 params.kmeans_trainset_fraction,
-                                 params.metric,
+                                 new_labels.data(),
+                                 false,
                                  stream);
 
-  // sizes of the clusters
-  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
-  auto list_sizes_ptr = list_sizes.data();
-  raft::stats::histogram<uint32_t, size_t>(raft::stats::HistTypeAuto,
-                                           reinterpret_cast<int32_t*>(list_sizes_ptr),
-                                           size_t(n_lists),
-                                           labels.data(),
-                                           n_rows,
-                                           1,
-                                           stream);
-
-  // Calculate offsets into cluster data using exclusive scan
-  auto&& list_offsets   = make_device_mdarray<IdxT>(stream, n_lists + 1);
-  auto list_offsets_ptr = list_offsets.data();
-
+  // Calculate new offsets
+  IdxT index_size;
   thrust::exclusive_scan(
     rmm::exec_policy(stream),
     list_sizes_ptr,
@@ -163,27 +162,47 @@ inline auto build(const handle_t& handle,
     IdxT(0),
     [] __device__(IdxT s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
 
-  IdxT index_size;
   update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
   handle.sync_stream(stream);
 
   auto&& data    = make_device_mdarray<T>(stream, index_size, dim);
   auto&& indices = make_device_mdarray<IdxT>(stream, index_size);
 
+  // Populate index with the old data
+  if (orig_index.size() > 0) {
+    utils::block_copy(orig_index.list_offsets.data(),
+                      list_offsets_ptr,
+                      IdxT(n_lists),
+                      orig_index.data.data(),
+                      data.data(),
+                      IdxT(dim),
+                      stream);
+
+    utils::block_copy(orig_index.list_offsets.data(),
+                      list_offsets_ptr,
+                      IdxT(n_lists),
+                      orig_index.indices.data(),
+                      indices.data(),
+                      IdxT(1),
+                      stream);
+  }
+
+  // Copy the old sizes, so we can start from the current state of the index;
   // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter.
-  utils::memset(list_sizes_ptr, 0, sizeof(uint32_t) * n_lists, stream);
+  raft::copy(list_sizes_ptr, orig_index.list_sizes.data(), list_sizes.size(), stream);
 
   const dim3 block_dim(256);
   const dim3 grid_dim(raft::ceildiv<IdxT>(n_rows, block_dim.x));
-  build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(labels.data(),
+  build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(new_labels.data(),
                                                          list_offsets_ptr,
-                                                         dataset,
+                                                         new_vectors,
+                                                         new_indices,
                                                          data.data(),
                                                          indices.data(),
                                                          list_sizes_ptr,
                                                          n_rows,
                                                          dim,
-                                                         veclen);
+                                                         orig_index.veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   // Precompute the centers vector norms for L2Expanded distance
@@ -193,10 +212,72 @@ inline auto build(const handle_t& handle,
     RAFT_LOG_TRACE_VEC(r.data(), 20);
     return r;
   };
-  auto&& center_norms = params.metric == raft::distance::DistanceType::L2Expanded
+  auto&& center_norms = orig_index.metric == raft::distance::DistanceType::L2Expanded
                           ? std::optional(compute_norms())
                           : std::nullopt;
 
+  // assemble the index
+  index<T, IdxT> new_index{{},
+                           orig_index.veclen,
+                           orig_index.metric,
+                           std::move(data),
+                           std::move(indices),
+                           std::move(list_sizes),
+                           std::move(list_offsets),
+                           std::move(centers),
+                           std::move(center_norms)};
+
+  // check index invariants
+  new_index.check_consistency();
+
+  return new_index;
+}
+
+/** See raft::spatial::knn::ivf_flat::build docs */
+template <typename T, typename IdxT>
+inline auto build(const handle_t& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  IdxT n_rows,
+                  uint32_t dim,
+                  rmm::cuda_stream_view stream) -> index<T, IdxT>
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_flat::build(%zu, %u)", size_t(n_rows), dim);
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
+                "unsupported data type");
+  RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
+
+  // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
+  // template parameter (https://github.com/rapidsai/raft/issues/711)
+  uint32_t veclen = 16 / sizeof(T);
+  while (dim % veclen != 0) {
+    veclen = veclen >> 1;
+  }
+  auto n_lists = static_cast<uint32_t>(params.n_lists);
+
+  // kmeans cluster ids for the dataset
+  auto&& centers = make_device_mdarray<float>(stream, n_lists, dim);
+
+  // Predict labels of the whole dataset
+  kmeans::build_optimized_kmeans(handle,
+                                 params.kmeans_n_iters,
+                                 dim,
+                                 dataset,
+                                 n_rows,
+                                 centers.data(),
+                                 n_lists,
+                                 params.kmeans_trainset_fraction,
+                                 params.metric,
+                                 stream);
+
+  auto&& data         = make_device_mdarray<T>(stream, 0, dim);
+  auto&& indices      = make_device_mdarray<IdxT>(stream, 0);
+  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
+  auto&& list_offsets = make_device_mdarray<IdxT>(stream, n_lists + 1);
+  utils::memzero(list_sizes.data(), list_sizes.size(), stream);
+  utils::memzero(list_offsets.data(), list_offsets.size(), stream);
+
   // assemble the index
   index<T, IdxT> index{{},
                        veclen,
@@ -206,12 +287,17 @@ inline auto build(const handle_t& handle,
                        std::move(list_sizes),
                        std::move(list_offsets),
                        std::move(centers),
-                       std::move(center_norms)};
+                       std::nullopt};
 
   // check index invariants
   index.check_consistency();
 
-  return index;
+  // add the data if necessary
+  if (params.add_data_on_build) {
+    return extend<T, IdxT>(handle, index, dataset, nullptr, n_rows, stream);
+  } else {
+    return index;
+  }
 }
 
 }  // namespace raft::spatial::knn::ivf_flat::detail
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 6d0e2e5911..c62fb8d2b8 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -55,6 +55,35 @@ inline auto build(
     handle, params, dataset, n_rows, dim, handle.get_stream());
 }
 
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    centers are adjusted to match the newly labeled data.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param handle
+ * @param index original index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows]
+ * @param n_rows the number of samples
+ *
+ * @return the constructed extended ivf-flat index
+ */
+template <typename T, typename IdxT>
+inline auto extend(const handle_t& handle,
+                   const index<T, IdxT>& orig_index,
+                   const T* new_vectors,
+                   const IdxT* new_indices,
+                   IdxT n_rows) -> index<T, IdxT>
+{
+  return raft::spatial::knn::ivf_flat::detail::extend(
+    handle, orig_index, new_vectors, new_indices, n_rows, handle.get_stream());
+}
+
 /**
  * @brief Search ANN using the constructed index.
  *
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/spatial/ann_ivf_flat.cu
index 22abb32659..7468fd75b7 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/spatial/ann_ivf_flat.cu
@@ -20,6 +20,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_type.hpp>
 #include <raft/random/rng.cuh>
+#include <raft/sparse/detail/utils.h>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/ivf_flat.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -198,12 +199,29 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs> {
         index_params.metric    = ps.metric;
         search_params.n_probes = ps.nprobe;
 
+        index_params.add_data_on_build        = false;
+        index_params.kmeans_trainset_fraction = 0.5;
         auto index =
           ivf_flat::build(handle_, index_params, database.data(), int64_t(ps.num_db_vecs), ps.dim);
 
+        rmm::device_uvector<int64_t> vector_indices(ps.num_db_vecs, stream_);
+        sparse::iota_fill(vector_indices.data(), int64_t(ps.num_db_vecs), int64_t(1), stream_);
+        handle_.sync_stream(stream_);
+
+        int64_t half_of_data = ps.num_db_vecs / 2;
+
+        auto index_2 =
+          ivf_flat::extend<DataT, int64_t>(handle_, index, database.data(), nullptr, half_of_data);
+
+        auto index_3 = ivf_flat::extend<DataT, int64_t>(handle_,
+                                                        index_2,
+                                                        database.data() + half_of_data * ps.dim,
+                                                        vector_indices.data() + half_of_data,
+                                                        int64_t(ps.num_db_vecs) - half_of_data);
+
         ivf_flat::search(handle_,
                          search_params,
-                         index,
+                         index_3,
                          search_queries.data(),
                          ps.num_queries,
                          ps.k,

From 7f640a98217eca04bfc8c1b87feb998232464697 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 21 Jul 2022 09:13:04 +0200
Subject: [PATCH 109/118] Improve the docs

---
 cpp/include/raft/spatial/knn/ivf_flat.cuh     | 39 ++++++++++++++++---
 .../raft/spatial/knn/ivf_flat_types.hpp       | 16 ++++----
 docs/source/cpp_api/spatial.rst               |  9 +++++
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index c62fb8d2b8..98cccd64df 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -31,9 +31,22 @@ namespace raft::spatial::knn::ivf_flat {
  * @brief Build the index from the dataset for efficient search.
  *
  * NB: Currently, the following distance metrics are supported:
- *   L2Expanded
- *   L2Unexpanded
- *   InnerProduct
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::spatial::knn;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
+ * @endcode
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices in the source dataset
@@ -62,13 +75,27 @@ inline auto build(
  *    The new data is clustered according to existing kmeans clusters, then the cluster
  *    centers are adjusted to match the newly labeled data.
  *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::spatial::knn;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
  * @tparam T data element type
  * @tparam IdxT type of the indices in the source dataset
  *
  * @param handle
- * @param index original index
+ * @param orig_index original index
  * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
  * @param n_rows the number of samples
  *
  * @return the constructed extended ivf-flat index
@@ -87,6 +114,8 @@ inline auto extend(const handle_t& handle,
 /**
  * @brief Search ANN using the constructed index.
  *
+ * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
+ *
  * @tparam T data element type
  * @tparam IdxT type of the indices
  *
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index e8d6cb74eb..81c9bba998 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -77,14 +77,14 @@ struct index : knn::index {
    * followed by a chunk of the same size of the next row, and so on.
    *
    * __Example__: veclen = 2, dim = 6, kGroupSize = 32, list_size = 31
-   * `
-   *   x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
-   *   x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
-   *   x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
-   *   x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
-   *   x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
-   *   x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
-   * `
+   *
+   *     x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
+   *     x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
+   *     x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
+   *     x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
+   *     x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
+   *     x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
+   *
    */
   device_mdarray<T, extent_2d, row_major> data;
   /** Inverted list indices: ids of items in the source data [size] */
diff --git a/docs/source/cpp_api/spatial.rst b/docs/source/cpp_api/spatial.rst
index 5065fa5af0..243bf19bf7 100644
--- a/docs/source/cpp_api/spatial.rst
+++ b/docs/source/cpp_api/spatial.rst
@@ -16,3 +16,12 @@ Nearest Neighbors
 .. doxygennamespace:: raft::spatial::knn
     :project: RAFT
     :members:
+
+
+
+IVF-Flat
+--------
+
+.. doxygennamespace:: raft::spatial::knn::ivf_flat
+    :project: RAFT
+    :members:

From 2e9eda52b9cfec173b6fd2fac84ff562aff1ba46 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 21 Jul 2022 10:28:38 +0200
Subject: [PATCH 110/118] Fix using non-existing log function

---
 cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index 8412d753fd..74e1ae75a8 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -653,8 +653,8 @@ void build_optimized_kmeans(const handle_t& handle,
 
   if (mesocluster_size_max * n_mesoclusters > 2 * n_rows_train) {
     RAFT_LOG_WARN("build_optimized_kmeans: built unbalanced mesoclusters");
-    RAFT_LOG_INFO_VEC(mesocluster_sizes, n_mesoclusters);
-    RAFT_LOG_INFO_VEC(fine_clusters_nums.data(), n_mesoclusters);
+    RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters);
+    RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters);
   }
 
   auto n_clusters_done = build_fine_clusters(handle,

From fb841c304427107e71c998ad4801217afd66a812 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 22 Jul 2022 08:59:06 +0200
Subject: [PATCH 111/118] Replace thurst::exclusive_scan with
 thrust::inclusive_scan to avoid an out-of-bounds read of the last entry

---
 .../raft/spatial/knn/detail/ivf_flat_build.cuh        | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index d1a49674b6..d82ed158e7 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -153,15 +153,14 @@ inline auto extend(const handle_t& handle,
                                  stream);
 
   // Calculate new offsets
-  IdxT index_size;
-  thrust::exclusive_scan(
+  IdxT index_size = 0;
+  update_device(list_offsets_ptr, &index_size, 1, stream);
+  thrust::inclusive_scan(
     rmm::exec_policy(stream),
     list_sizes_ptr,
-    list_sizes_ptr + n_lists + 1,
-    list_offsets_ptr,
-    IdxT(0),
+    list_sizes_ptr + n_lists,
+    list_offsets_ptr + 1,
     [] __device__(IdxT s, uint32_t l) { return s + Pow2<WarpSize>::roundUp(l); });
-
   update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
   handle.sync_stream(stream);
 

From 0c72ee8a6004108162d70ac80fb2b007bc424f93 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 22 Jul 2022 13:38:50 +0200
Subject: [PATCH 112/118] ann_common.h: remove deps on cuda code, so that the
 file can be included in .cpp files as before (in cuml)

---
 cpp/include/raft/spatial/knn/ann_common.h   |   4 +-
 cpp/include/raft/spatial/knn/ann_common.hpp | 111 --------------------
 2 files changed, 3 insertions(+), 112 deletions(-)
 delete mode 100644 cpp/include/raft/spatial/knn/ann_common.hpp

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 463950045f..1f0d59683b 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -20,7 +20,6 @@
 
 #pragma once
 
-#include "detail/processing.hpp"
 #include "ivf_flat_types.hpp"
 
 #include <raft/distance/distance_type.hpp>
@@ -32,6 +31,9 @@ namespace raft {
 namespace spatial {
 namespace knn {
 
+template <typename math_t>
+class MetricProcessor;
+
 struct knnIndex {
   raft::distance::DistanceType metric;
   float metricArg;
diff --git a/cpp/include/raft/spatial/knn/ann_common.hpp b/cpp/include/raft/spatial/knn/ann_common.hpp
deleted file mode 100644
index e91444a7f2..0000000000
--- a/cpp/include/raft/spatial/knn/ann_common.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "detail/processing.hpp"
-#include <faiss/gpu/GpuIndex.h>
-#include <raft/distance/distance_type.hpp>
-#include <raft/spatial/knn/faiss_mr.hpp>
-
-namespace raft::spatial::knn {
-
-namespace ivf_flat {
-template <typename T, typename IdxT>
-class index;
-};
-
-struct knnIndex {
-  raft::distance::DistanceType metric;
-  float metricArg;
-  std::unique_ptr<faiss::gpu::GpuIndex> index;
-  std::unique_ptr<MetricProcessor<float>> metric_processor;
-  std::unique_ptr<const ivf_flat::index<float, int64_t>> ivf_flat_float_;
-  std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>> ivf_flat_uint8_t_;
-  std::unique_ptr<const ivf_flat::index<int8_t, int64_t>> ivf_flat_int8_t_;
-
-  std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
-  int device;
-
-  template <typename T, typename IdxT>
-  auto ivf_flat() -> std::unique_ptr<const ivf_flat::index<T, IdxT>>&;
-};
-
-template <>
-auto knnIndex::ivf_flat<float>() -> std::unique_ptr<const ivf_flat::index<float, int64_t>>&
-{
-  return ivf_flat_float_;
-}
-
-template <>
-auto knnIndex::ivf_flat<uint8_t>() -> std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>>&
-{
-  return ivf_flat_uint8_t_;
-}
-
-template <>
-auto knnIndex::ivf_flat<int8_t>() -> std::unique_ptr<const ivf_flat::index<int8_t, int64_t>>&
-{
-  return ivf_flat_int8_t_;
-}
-
-enum QuantizerType : unsigned int {
-  QT_8bit,
-  QT_4bit,
-  QT_8bit_uniform,
-  QT_4bit_uniform,
-  QT_fp16,
-  QT_8bit_direct,
-  QT_6bit
-};
-
-struct knn_index_params {
-  /** Distance type. */
-  raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
-  /** The argument used by some distance metrics. */
-  float metric_arg = 2.0f;
-
-  virtual ~knn_index_params() = default;
-};
-
-struct knn_search_params {
-  virtual ~knn_search_params() = default;
-};
-
-struct ivf_index_params : knn_index_params {
-  /** The number of inverted lists (clusters) */
-  uint32_t n_lists = 1024;
-};
-
-struct ivf_search_params : knn_search_params {
-  /** The number of clusters to search. */
-  uint32_t n_probes = 20;
-};
-
-// TODO: move to ivf_pq
-struct ivf_pq_index_params : ivf_index_params {
-  int n_subquantizers;
-  int n_bits;
-  bool use_precomputed_tables;
-};
-
-// TODO: move to ivf_sq
-struct ivf_sq_index_params : ivf_index_params {
-  QuantizerType qtype;
-  bool encode_residual;
-};
-
-};  // namespace raft::spatial::knn

From 0196695e9907299ecfcd6b738da1aca02b00e34e Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 22 Jul 2022 14:24:38 +0200
Subject: [PATCH 113/118] Make helper overloads inline for linking in cuml

---
 cpp/include/raft/spatial/knn/ann_common.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 1f0d59683b..dc5a6c50fb 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -52,20 +52,21 @@ struct knnIndex {
 };
 
 template <>
-auto knnIndex::ivf_flat<float, int64_t>() -> std::unique_ptr<const ivf_flat::index<float, int64_t>>&
+inline auto knnIndex::ivf_flat<float, int64_t>()
+  -> std::unique_ptr<const ivf_flat::index<float, int64_t>>&
 {
   return ivf_flat_float_;
 }
 
 template <>
-auto knnIndex::ivf_flat<uint8_t, int64_t>()
+inline auto knnIndex::ivf_flat<uint8_t, int64_t>()
   -> std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>>&
 {
   return ivf_flat_uint8_t_;
 }
 
 template <>
-auto knnIndex::ivf_flat<int8_t, int64_t>()
+inline auto knnIndex::ivf_flat<int8_t, int64_t>()
   -> std::unique_ptr<const ivf_flat::index<int8_t, int64_t>>&
 {
   return ivf_flat_int8_t_;

From eb156395788614c7e700ffe64bad8e644206a488 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 22 Jul 2022 14:41:42 +0200
Subject: [PATCH 114/118] Split processing.hpp into *.cuh and *.hpp to avoid
 incomplete types

---
 cpp/include/raft/spatial/knn/ann_common.h     |   4 +-
 .../raft/spatial/knn/detail/ann_quantized.cuh |   2 +-
 .../raft/spatial/knn/detail/fused_l2_knn.cuh  |   2 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |   2 +-
 .../raft/spatial/knn/detail/processing.cuh    | 203 ++++++++++++++++++
 .../raft/spatial/knn/detail/processing.hpp    | 177 ---------------
 6 files changed, 207 insertions(+), 183 deletions(-)
 create mode 100644 cpp/include/raft/spatial/knn/detail/processing.cuh

diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index dc5a6c50fb..45867dbfee 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "detail/processing.hpp"
 #include "ivf_flat_types.hpp"
 
 #include <raft/distance/distance_type.hpp>
@@ -31,9 +32,6 @@ namespace raft {
 namespace spatial {
 namespace knn {
 
-template <typename math_t>
-class MetricProcessor;
-
 struct knnIndex {
   raft::distance::DistanceType metric;
   float metricArg;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 6b3bb73acc..0dd6baed5b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -21,7 +21,7 @@
 #include "knn_brute_force_faiss.cuh"
 
 #include "common_faiss.h"
-#include "processing.hpp"
+#include "processing.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index f8532e52a0..41f1df85fe 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -19,7 +19,7 @@
 #include <limits>
 #include <raft/linalg/norm.cuh>
 // TODO: Need to hide the PairwiseDistance class impl and expose to public API
-#include "processing.hpp"
+#include "processing.cuh"
 #include <raft/distance/detail/distance.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index f78ffa84e1..7cefeffea2 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -38,7 +38,7 @@
 
 #include "fused_l2_knn.cuh"
 #include "haversine_distance.cuh"
-#include "processing.hpp"
+#include "processing.cuh"
 
 #include "common_faiss.h"
 
diff --git a/cpp/include/raft/spatial/knn/detail/processing.cuh b/cpp/include/raft/spatial/knn/detail/processing.cuh
new file mode 100644
index 0000000000..2319df5aac
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/processing.cuh
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "processing.hpp"
+
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/stats/mean.cuh>
+#include <raft/stats/mean_center.cuh>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+template <typename math_t>
+class CosineMetricProcessor : public MetricProcessor<math_t> {
+ protected:
+  int k_;
+  bool row_major_;
+  size_t n_rows_;
+  size_t n_cols_;
+  cudaStream_t stream_;
+  rmm::device_uvector<math_t> colsums_;
+
+ public:
+  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
+    : stream_(stream),
+      colsums_(n_rows, stream),
+      n_cols_(n_cols),
+      n_rows_(n_rows),
+      row_major_(row_major),
+      k_(k)
+  {
+  }
+
+  void preprocess(math_t* data)
+  {
+    raft::linalg::rowNorm(colsums_.data(),
+                          data,
+                          n_cols_,
+                          n_rows_,
+                          raft::linalg::NormType::L2Norm,
+                          row_major_,
+                          stream_,
+                          [] __device__(math_t in) { return sqrtf(in); });
+
+    raft::linalg::matrixVectorOp(
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
+      [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
+      stream_);
+  }
+
+  void revert(math_t* data)
+  {
+    raft::linalg::matrixVectorOp(
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
+      [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
+      stream_);
+  }
+
+  void postprocess(math_t* data)
+  {
+    raft::linalg::unaryOp(
+      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
+  }
+
+  ~CosineMetricProcessor() = default;
+};
+
+template <typename math_t>
+class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
+  using cosine = CosineMetricProcessor<math_t>;
+
+ public:
+  CorrelationMetricProcessor(
+    size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
+    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream)
+  {
+  }
+
+  void preprocess(math_t* data)
+  {
+    math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
+
+    raft::linalg::reduce(means_.data(),
+                         data,
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         (math_t)0.0,
+                         cosine::row_major_,
+                         true,
+                         cosine::stream_);
+
+    raft::linalg::unaryOp(
+      means_.data(),
+      means_.data(),
+      cosine::n_rows_,
+      [=] __device__(math_t in) { return in * normalizer_const; },
+      cosine::stream_);
+
+    raft::stats::meanCenter(data,
+                            data,
+                            means_.data(),
+                            cosine::n_cols_,
+                            cosine::n_rows_,
+                            cosine::row_major_,
+                            false,
+                            cosine::stream_);
+
+    CosineMetricProcessor<math_t>::preprocess(data);
+  }
+
+  void revert(math_t* data)
+  {
+    CosineMetricProcessor<math_t>::revert(data);
+
+    raft::stats::meanAdd(data,
+                         data,
+                         means_.data(),
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         cosine::row_major_,
+                         false,
+                         cosine::stream_);
+  }
+
+  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
+
+  ~CorrelationMetricProcessor() = default;
+
+  rmm::device_uvector<math_t> means_;
+};
+
+template <typename math_t>
+class DefaultMetricProcessor : public MetricProcessor<math_t> {
+ public:
+  void preprocess(math_t* data) {}
+
+  void revert(math_t* data) {}
+
+  void postprocess(math_t* data) {}
+
+  ~DefaultMetricProcessor() = default;
+};
+
+template <typename math_t>
+inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
+  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream)
+{
+  MetricProcessor<math_t>* mp = nullptr;
+
+  switch (metric) {
+    case distance::DistanceType::CosineExpanded:
+      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
+      break;
+
+    case distance::DistanceType::CorrelationExpanded:
+      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
+      break;
+    default: mp = new DefaultMetricProcessor<math_t>();
+  }
+
+  return std::unique_ptr<MetricProcessor<math_t>>(mp);
+}
+
+// Currently only being used by floats
+template class MetricProcessor<float>;
+template class CosineMetricProcessor<float>;
+template class CorrelationMetricProcessor<float>;
+template class DefaultMetricProcessor<float>;
+
+}  // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index 001f57a4aa..32e5541eb8 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -15,14 +15,6 @@
  */
 #pragma once
 
-#include <raft/distance/distance_type.hpp>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/stats/mean.cuh>
-#include <raft/stats/mean_center.cuh>
-#include <rmm/device_uvector.hpp>
-
 namespace raft {
 namespace spatial {
 namespace knn {
@@ -46,175 +38,6 @@ class MetricProcessor {
   virtual ~MetricProcessor() = default;
 };
 
-template <typename math_t>
-class CosineMetricProcessor : public MetricProcessor<math_t> {
- protected:
-  int k_;
-  bool row_major_;
-  size_t n_rows_;
-  size_t n_cols_;
-  cudaStream_t stream_;
-  rmm::device_uvector<math_t> colsums_;
-
- public:
-  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : stream_(stream),
-      colsums_(n_rows, stream),
-      n_cols_(n_cols),
-      n_rows_(n_rows),
-      row_major_(row_major),
-      k_(k)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    raft::linalg::rowNorm(colsums_.data(),
-                          data,
-                          n_cols_,
-                          n_rows_,
-                          raft::linalg::NormType::L2Norm,
-                          row_major_,
-                          stream_,
-                          [] __device__(math_t in) { return sqrtf(in); });
-
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
-      stream_);
-  }
-
-  void revert(math_t* data)
-  {
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
-      stream_);
-  }
-
-  void postprocess(math_t* data)
-  {
-    raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
-  }
-
-  ~CosineMetricProcessor() = default;
-};
-
-template <typename math_t>
-class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
-  using cosine = CosineMetricProcessor<math_t>;
-
- public:
-  CorrelationMetricProcessor(
-    size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
-
-    raft::linalg::reduce(means_.data(),
-                         data,
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         (math_t)0.0,
-                         cosine::row_major_,
-                         true,
-                         cosine::stream_);
-
-    raft::linalg::unaryOp(
-      means_.data(),
-      means_.data(),
-      cosine::n_rows_,
-      [=] __device__(math_t in) { return in * normalizer_const; },
-      cosine::stream_);
-
-    raft::stats::meanCenter(data,
-                            data,
-                            means_.data(),
-                            cosine::n_cols_,
-                            cosine::n_rows_,
-                            cosine::row_major_,
-                            false,
-                            cosine::stream_);
-
-    CosineMetricProcessor<math_t>::preprocess(data);
-  }
-
-  void revert(math_t* data)
-  {
-    CosineMetricProcessor<math_t>::revert(data);
-
-    raft::stats::meanAdd(data,
-                         data,
-                         means_.data(),
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         cosine::row_major_,
-                         false,
-                         cosine::stream_);
-  }
-
-  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
-
-  ~CorrelationMetricProcessor() = default;
-
-  rmm::device_uvector<math_t> means_;
-};
-
-template <typename math_t>
-class DefaultMetricProcessor : public MetricProcessor<math_t> {
- public:
-  void preprocess(math_t* data) {}
-
-  void revert(math_t* data) {}
-
-  void postprocess(math_t* data) {}
-
-  ~DefaultMetricProcessor() = default;
-};
-
-template <typename math_t>
-inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream)
-{
-  MetricProcessor<math_t>* mp = nullptr;
-
-  switch (metric) {
-    case distance::DistanceType::CosineExpanded:
-      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-
-    case distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-    default: mp = new DefaultMetricProcessor<math_t>();
-  }
-
-  return std::unique_ptr<MetricProcessor<math_t>>(mp);
-}
-
-// Currently only being used by floats
-template class MetricProcessor<float>;
-template class CosineMetricProcessor<float>;
-template class CorrelationMetricProcessor<float>;
-template class DefaultMetricProcessor<float>;
-
 }  // namespace knn
 }  // namespace spatial
 }  // namespace raft

From e4b2b395a1ab28ddb0a66ca2ed811e9201efcbcb Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 25 Jul 2022 16:03:07 +0200
Subject: [PATCH 115/118] WIP: investigating segmentation fault in cuml test

---
 cpp/include/raft/core/mdarray.hpp   | 51 ++++++++++++++---------------
 cpp/include/raft/detail/mdarray.hpp | 30 ++++++++---------
 2 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index a4f6ca67b1..2b6fa623f6 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -758,32 +758,31 @@ auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
   return mdarray_t{layout, policy};
 }
 
-/**
- * @brief Create a device mdarray.
- * @tparam ElementType the data type of the matrix elements
- * @tparam LayoutPolicy policy for strides and layout ordering
- * @param stream cuda stream for ordering events
- * @param mr rmm memory resource used for allocating the memory for the array
- * @param exts dimensionality of the array (series of integers)
- * @return raft::device_mdarray
- */
-template <typename ElementType,
-          typename LayoutPolicy = layout_c_contiguous,
-          typename... Extents,
-          typename = detail::ensure_integral_extents<Extents...>>
-auto make_device_mdarray(rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr,
-                         Extents... exts)
-{
-  using extent_t  = extents<((void)exts, dynamic_extent)...>;
-  using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
-
-  typename mdarray_t::extents_type extent{exts...};
-  typename mdarray_t::mapping_type layout{extent};
-  typename mdarray_t::container_policy_type policy{stream, mr};
-
-  return mdarray_t{layout, policy};
-}
+// /**
+//  * @brief Create a device mdarray.
+//  * @tparam ElementType the data type of the matrix elements
+//  * @tparam LayoutPolicy policy for strides and layout ordering
+//  * @param stream cuda stream for ordering events
+//  * @param mr rmm memory resource used for allocating the memory for the array
+//  * @param exts dimensionality of the array (series of integers)
+//  * @return raft::device_mdarray
+//  */
+// template <typename ElementType,
+//           typename LayoutPolicy = layout_c_contiguous,
+//           typename... Extents,
+//           typename = detail::ensure_integral_extents<Extents...>>
+// auto make_device_mdarray(rmm::cuda_stream_view stream,
+//                          rmm::mr::device_memory_resource* mr,
+//                          Extents... exts)
+// {
+//   using extent_t  = extents<((void)exts, dynamic_extent)...>;
+//   using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
+
+//   typename mdarray_t::extents_type extent{exts...};
+//   typename mdarray_t::mapping_type layout{extent};
+//   typename mdarray_t::container_policy_type policy{stream, mr};
+//   return mdarray_t{layout, policy};
+// }
 
 /**
  * @brief Create a 2-dim c-contiguous host mdarray.
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index e2ab07d75c..53aa6435fb 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -107,10 +107,13 @@ class device_uvector {
   /**
    * @brief Ctor that accepts a size, stream and an optional mr.
    */
-  explicit device_uvector(
-    std::size_t size,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  explicit device_uvector(std::size_t size, rmm::cuda_stream_view stream) : data_{size, stream} {}
+  /**
+   * @brief Ctor that accepts a size, stream and a memory resource.
+   */
+  explicit device_uvector(std::size_t size,
+                          rmm::cuda_stream_view stream,
+                          rmm::mr::device_memory_resource* mr)
     : data_{size, stream, mr}
   {
   }
@@ -141,7 +144,11 @@ class device_uvector {
 template <typename ElementType>
 class device_uvector_policy {
   rmm::cuda_stream_view stream_;
-  rmm::mr::device_memory_resource* mr_;
+  // FIXME: adding this member makes cuml pytest crash:
+  //    python/cuml/tests/test_nearest_neighbors.py::test_ann_distances_metrics[sqeuclidean-ivfflat]
+  //    this crashes during GC, at the moment any mdarray member of ivf_flat::index is getting
+  //    destroyed
+  // rmm::mr::device_memory_resource* mr_ = nullptr;
 
  public:
   using element_type   = ElementType;
@@ -156,21 +163,12 @@ class device_uvector_policy {
   using const_accessor_policy = std::experimental::default_accessor<element_type const>;
 
  public:
-  auto create(size_t n) -> container_type
-  {
-    return mr_ ? container_type(n, stream_, mr_) : container_type(n, stream_);
-  }
+  auto create(size_t n) -> container_type { return container_type(n, stream_); }
 
   device_uvector_policy() = delete;
   explicit device_uvector_policy(rmm::cuda_stream_view stream) noexcept(
     std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}, mr_(nullptr)
-  {
-  }
-
-  device_uvector_policy(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}, mr_(mr)
+    : stream_{stream}
   {
   }
 

From 6bc0fcb3d62dce8f0de8f54bb01f165fbf7020ec Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 26 Jul 2022 08:16:10 +0200
Subject: [PATCH 116/118] Revert the wip-changes from the last commit

---
 cpp/include/raft/core/mdarray.hpp   | 51 +++++++++++++++--------------
 cpp/include/raft/detail/mdarray.hpp | 19 ++++++-----
 2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 2b6fa623f6..a4f6ca67b1 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -758,31 +758,32 @@ auto make_device_mdarray(rmm::cuda_stream_view stream, Extents... exts)
   return mdarray_t{layout, policy};
 }
 
-// /**
-//  * @brief Create a device mdarray.
-//  * @tparam ElementType the data type of the matrix elements
-//  * @tparam LayoutPolicy policy for strides and layout ordering
-//  * @param stream cuda stream for ordering events
-//  * @param mr rmm memory resource used for allocating the memory for the array
-//  * @param exts dimensionality of the array (series of integers)
-//  * @return raft::device_mdarray
-//  */
-// template <typename ElementType,
-//           typename LayoutPolicy = layout_c_contiguous,
-//           typename... Extents,
-//           typename = detail::ensure_integral_extents<Extents...>>
-// auto make_device_mdarray(rmm::cuda_stream_view stream,
-//                          rmm::mr::device_memory_resource* mr,
-//                          Extents... exts)
-// {
-//   using extent_t  = extents<((void)exts, dynamic_extent)...>;
-//   using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
-
-//   typename mdarray_t::extents_type extent{exts...};
-//   typename mdarray_t::mapping_type layout{extent};
-//   typename mdarray_t::container_policy_type policy{stream, mr};
-//   return mdarray_t{layout, policy};
-// }
+/**
+ * @brief Create a device mdarray.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param stream cuda stream for ordering events
+ * @param mr rmm memory resource used for allocating the memory for the array
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::device_mdarray
+ */
+template <typename ElementType,
+          typename LayoutPolicy = layout_c_contiguous,
+          typename... Extents,
+          typename = detail::ensure_integral_extents<Extents...>>
+auto make_device_mdarray(rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource* mr,
+                         Extents... exts)
+{
+  using extent_t  = extents<((void)exts, dynamic_extent)...>;
+  using mdarray_t = device_mdarray<ElementType, extent_t, LayoutPolicy>;
+
+  typename mdarray_t::extents_type extent{exts...};
+  typename mdarray_t::mapping_type layout{extent};
+  typename mdarray_t::container_policy_type policy{stream, mr};
+
+  return mdarray_t{layout, policy};
+}
 
 /**
  * @brief Create a 2-dim c-contiguous host mdarray.
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 53aa6435fb..dd87e9f245 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -144,11 +144,7 @@ class device_uvector {
 template <typename ElementType>
 class device_uvector_policy {
   rmm::cuda_stream_view stream_;
-  // FIXME: adding this member makes cuml pytest crash:
-  //    python/cuml/tests/test_nearest_neighbors.py::test_ann_distances_metrics[sqeuclidean-ivfflat]
-  //    this crashes during GC, at the moment any mdarray member of ivf_flat::index is getting
-  //    destroyed
-  // rmm::mr::device_memory_resource* mr_ = nullptr;
+  rmm::mr::device_memory_resource* mr_;
 
  public:
   using element_type   = ElementType;
@@ -163,12 +159,17 @@ class device_uvector_policy {
   using const_accessor_policy = std::experimental::default_accessor<element_type const>;
 
  public:
-  auto create(size_t n) -> container_type { return container_type(n, stream_); }
+  auto create(size_t n) -> container_type
+  {
+    return mr_ ? container_type(n, stream_, mr_) : container_type(n, stream_);
+  }
 
   device_uvector_policy() = delete;
-  explicit device_uvector_policy(rmm::cuda_stream_view stream) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}
+  explicit device_uvector_policy(
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr =
+      nullptr) noexcept(std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
+    : stream_{stream}, mr_(mr)
   {
   }
 

From 196b83fed03fbc99bfcb218b1645a05f4f049236 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 26 Jul 2022 09:27:07 +0200
Subject: [PATCH 117/118] Fix K not set for postprocessing of the queries

---
 cpp/include/raft/spatial/knn/detail/ann_quantized.cuh | 5 ++++-
 cpp/include/raft/spatial/knn/detail/processing.cuh    | 2 ++
 cpp/include/raft/spatial/knn/detail/processing.hpp    | 2 ++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 0dd6baed5b..5a56a84fe3 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -168,7 +168,10 @@ void approx_knn_search(const handle_t& handle,
   auto faiss_ivf = dynamic_cast<GpuIndexIVF*>(index->index.get());
   if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); }
 
-  if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(query_array); }
+  if constexpr (std::is_same_v<T, float>) {
+    index->metric_processor->preprocess(query_array);
+    index->metric_processor->set_num_queries(k);
+  }
 
   // search
   if (faiss_ivf) {
diff --git a/cpp/include/raft/spatial/knn/detail/processing.cuh b/cpp/include/raft/spatial/knn/detail/processing.cuh
index 2319df5aac..79c437b020 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.cuh
+++ b/cpp/include/raft/spatial/knn/detail/processing.cuh
@@ -93,6 +93,8 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
       data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
   }
 
+  virtual void set_num_queries(int k) { k_ = k; }
+
   ~CosineMetricProcessor() = default;
 };
 
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index 32e5541eb8..41003c2030 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -35,6 +35,8 @@ class MetricProcessor {
 
   virtual void postprocess(math_t* data) {}
 
+  virtual void set_num_queries(int k) {}
+
   virtual ~MetricProcessor() = default;
 };
 

From 8b267506f4f48936a6609d8d7d1866eca754bd56 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 26 Jul 2022 15:50:00 +0200
Subject: [PATCH 118/118] Replace mdarrays with rmm::device_uvector to
 workaround a crash in cuml

---
 .../spatial/knn/detail/ivf_flat_build.cuh     | 38 ++++++++------
 .../spatial/knn/detail/ivf_flat_search.cuh    | 34 ++++++-------
 .../raft/spatial/knn/ivf_flat_types.hpp       | 49 +++++++------------
 3 files changed, 57 insertions(+), 64 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index d82ed158e7..96af5c9522 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -111,12 +111,12 @@ inline auto extend(const handle_t& handle,
                    IdxT n_rows,
                    rmm::cuda_stream_view stream) -> index<T, IdxT>
 {
-  auto n_lists = orig_index.n_lists();
-  auto dim     = orig_index.dim();
+  auto n_lists = orig_index.n_lists;
+  auto dim     = orig_index.dim;
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim);
 
-  RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0,
+  RAFT_EXPECTS(new_indices != nullptr || orig_index.size == 0,
                "You must pass data indices when the index is non-empty.");
 
   rmm::device_uvector<uint32_t> new_labels(n_rows, stream);
@@ -130,12 +130,12 @@ inline auto extend(const handle_t& handle,
                   orig_index.metric,
                   stream);
 
-  auto&& list_sizes     = make_device_mdarray<uint32_t>(stream, n_lists);
-  auto&& list_offsets   = make_device_mdarray<IdxT>(stream, n_lists + 1);
+  auto&& list_sizes     = rmm::device_uvector<uint32_t>(n_lists, stream);
+  auto&& list_offsets   = rmm::device_uvector<IdxT>(n_lists + 1, stream);
   auto list_sizes_ptr   = list_sizes.data();
   auto list_offsets_ptr = list_offsets.data();
 
-  auto&& centers   = make_device_mdarray<float>(stream, n_lists, dim);
+  auto&& centers   = rmm::device_uvector<float>(size_t(n_lists) * size_t(dim), stream);
   auto centers_ptr = centers.data();
 
   // Calculate the centers and sizes on the new data, starting from the original values
@@ -164,11 +164,11 @@ inline auto extend(const handle_t& handle,
   update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
   handle.sync_stream(stream);
 
-  auto&& data    = make_device_mdarray<T>(stream, index_size, dim);
-  auto&& indices = make_device_mdarray<IdxT>(stream, index_size);
+  auto&& data    = rmm::device_uvector<T>(index_size * IdxT(dim), stream);
+  auto&& indices = rmm::device_uvector<IdxT>(index_size, stream);
 
   // Populate index with the old data
-  if (orig_index.size() > 0) {
+  if (orig_index.size > 0) {
     utils::block_copy(orig_index.list_offsets.data(),
                       list_offsets_ptr,
                       IdxT(n_lists),
@@ -206,10 +206,10 @@ inline auto extend(const handle_t& handle,
 
   // Precompute the centers vector norms for L2Expanded distance
   auto compute_norms = [&]() {
-    auto&& r = make_device_mdarray<float>(stream, n_lists);
+    auto&& r = rmm::device_uvector<float>(n_lists, stream);
     utils::dots_along_rows(n_lists, dim, centers.data(), r.data(), stream);
     RAFT_LOG_TRACE_VEC(r.data(), 20);
-    return r;
+    return std::move(r);
   };
   auto&& center_norms = orig_index.metric == raft::distance::DistanceType::L2Expanded
                           ? std::optional(compute_norms())
@@ -219,6 +219,9 @@ inline auto extend(const handle_t& handle,
   index<T, IdxT> new_index{{},
                            orig_index.veclen,
                            orig_index.metric,
+                           index_size,
+                           orig_index.dim,
+                           orig_index.n_lists,
                            std::move(data),
                            std::move(indices),
                            std::move(list_sizes),
@@ -256,7 +259,7 @@ inline auto build(const handle_t& handle,
   auto n_lists = static_cast<uint32_t>(params.n_lists);
 
   // kmeans cluster ids for the dataset
-  auto&& centers = make_device_mdarray<float>(stream, n_lists, dim);
+  auto&& centers = rmm::device_uvector<float>(size_t(n_lists) * size_t(dim), stream);
 
   // Predict labels of the whole dataset
   kmeans::build_optimized_kmeans(handle,
@@ -270,10 +273,10 @@ inline auto build(const handle_t& handle,
                                  params.metric,
                                  stream);
 
-  auto&& data         = make_device_mdarray<T>(stream, 0, dim);
-  auto&& indices      = make_device_mdarray<IdxT>(stream, 0);
-  auto&& list_sizes   = make_device_mdarray<uint32_t>(stream, n_lists);
-  auto&& list_offsets = make_device_mdarray<IdxT>(stream, n_lists + 1);
+  auto&& data         = rmm::device_uvector<T>(0, stream);
+  auto&& indices      = rmm::device_uvector<IdxT>(0, stream);
+  auto&& list_sizes   = rmm::device_uvector<uint32_t>(n_lists, stream);
+  auto&& list_offsets = rmm::device_uvector<IdxT>(n_lists + 1, stream);
   utils::memzero(list_sizes.data(), list_sizes.size(), stream);
   utils::memzero(list_offsets.data(), list_offsets.size(), stream);
 
@@ -281,6 +284,9 @@ inline auto build(const handle_t& handle,
   index<T, IdxT> index{{},
                        veclen,
                        params.metric,
+                       IdxT(0),
+                       dim,
+                       n_lists,
                        std::move(data),
                        std::move(indices),
                        std::move(list_sizes),
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index c04ece3858..a52fbc69de 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -825,7 +825,7 @@ void launch_kernel(Lambda lambda,
     interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda>;
   const int max_query_smem = 16384;
   int query_smem_elems =
-    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
+    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim));
   int smem_size              = query_smem_elems * sizeof(T);
   constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
   smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, size_t>(
@@ -861,10 +861,10 @@ void launch_kernel(Lambda lambda,
                                                         index.list_offsets.data(),
                                                         n_probes,
                                                         k,
-                                                        index.dim(),
+                                                        index.dim,
                                                         neighbors,
                                                         distances);
-    queries += grid_dim_y * index.dim();
+    queries += grid_dim_y * index.dim;
     neighbors += grid_dim_y * grid_dim_x * k;
     distances += grid_dim_y * grid_dim_x * k;
   }
@@ -1072,7 +1072,7 @@ void search_impl(const handle_t& handle,
   // The norm of query
   rmm::device_uvector<float> query_norm_dev(n_queries, stream, search_mr);
   // The distance value of cluster(list) and queries
-  rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr);
+  rmm::device_uvector<float> distance_buffer_dev(n_queries * index.n_lists, stream, search_mr);
   // The topk distance value of cluster(list) and queries
   rmm::device_uvector<float> coarse_distances_dev(n_queries * n_probes, stream, search_mr);
   // The topk  index of cluster(list) and queries
@@ -1084,7 +1084,7 @@ void search_impl(const handle_t& handle,
 
   size_t float_query_size;
   if constexpr (std::is_integral_v<T>) {
-    float_query_size = n_queries * index.dim();
+    float_query_size = n_queries * index.dim;
   } else {
     float_query_size = 0;
   }
@@ -1095,7 +1095,7 @@ void search_impl(const handle_t& handle,
     converted_queries_ptr = const_cast<float*>(queries);
   } else {
     linalg::unaryOp(
-      converted_queries_ptr, queries, n_queries * index.dim(), utils::mapping<float>{}, stream);
+      converted_queries_ptr, queries, n_queries * index.dim, utils::mapping<float>{}, stream);
   }
 
   float alpha = 1.0f;
@@ -1105,11 +1105,11 @@ void search_impl(const handle_t& handle,
     alpha = -2.0f;
     beta  = 1.0f;
     utils::dots_along_rows(
-      n_queries, index.dim(), converted_queries_ptr, query_norm_dev.data(), stream);
+      n_queries, index.dim, converted_queries_ptr, query_norm_dev.data(), stream);
     utils::outer_add(query_norm_dev.data(),
                      n_queries,
                      index.center_norms->data(),
-                     index.n_lists(),
+                     index.n_lists,
                      distance_buffer_dev.data(),
                      stream);
     RAFT_LOG_TRACE_VEC(index.center_norms->data(), 20);
@@ -1122,17 +1122,17 @@ void search_impl(const handle_t& handle,
   linalg::gemm(handle,
                true,
                false,
-               index.n_lists(),
+               index.n_lists,
                n_queries,
-               index.dim(),
+               index.dim,
                &alpha,
                index.centers.data(),
-               index.dim(),
+               index.dim,
                converted_queries_ptr,
-               index.dim(),
+               index.dim,
                &beta,
                distance_buffer_dev.data(),
-               index.n_lists(),
+               index.n_lists,
                stream);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), 20);
@@ -1140,7 +1140,7 @@ void search_impl(const handle_t& handle,
     topk::warp_sort_topk<AccT, uint32_t>(distance_buffer_dev.data(),
                                          nullptr,
                                          n_queries,
-                                         index.n_lists(),
+                                         index.n_lists,
                                          n_probes,
                                          coarse_distances_dev.data(),
                                          coarse_indices_dev.data(),
@@ -1151,7 +1151,7 @@ void search_impl(const handle_t& handle,
     topk::radix_topk<AccT, uint32_t, 11, 512>(distance_buffer_dev.data(),
                                               nullptr,
                                               n_queries,
-                                              index.n_lists(),
+                                              index.n_lists,
                                               n_probes,
                                               coarse_distances_dev.data(),
                                               coarse_indices_dev.data(),
@@ -1249,11 +1249,11 @@ inline void search(const handle_t& handle,
                    rmm::mr::device_memory_resource* mr = nullptr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
+    "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim);
 
   RAFT_EXPECTS(params.n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
-  auto n_probes = std::min<uint32_t>(params.n_probes, index.n_lists());
+  auto n_probes = std::min<uint32_t>(params.n_probes, index.n_lists);
 
   bool select_min;
   switch (index.metric) {
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 81c9bba998..6c46a288c1 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -18,10 +18,12 @@
 
 #include "common.hpp"
 
-#include <raft/core/mdarray.hpp>
+#include <raft/core/error.hpp>
 #include <raft/distance/distance_type.hpp>
 #include <raft/integer_utils.h>
 
+#include <rmm/device_uvector.hpp>
+
 #include <optional>
 
 namespace raft::spatial::knn::ivf_flat {
@@ -62,6 +64,12 @@ struct index : knn::index {
   const uint32_t veclen;
   /** Distance metric used for clustering. */
   const raft::distance::DistanceType metric;
+  /** Total length of the index. */
+  const IdxT size;
+  /** Dimensionality of the data. */
+  const uint32_t dim;
+  /** Number of clusters/inverted lists. */
+  const uint32_t n_lists;
 
   /**
    * Inverted list data [size, dim].
@@ -86,20 +94,20 @@ struct index : knn::index {
    *     x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
    *
    */
-  device_mdarray<T, extent_2d, row_major> data;
+  rmm::device_uvector<T> data;
   /** Inverted list indices: ids of items in the source data [size] */
-  device_mdarray<IdxT, extent_1d, row_major> indices;
+  rmm::device_uvector<IdxT> indices;
   /** Sizes of the lists (clusters) [n_lists] */
-  device_mdarray<uint32_t, extent_1d, row_major> list_sizes;
+  rmm::device_uvector<uint32_t> list_sizes;
   /**
    * Offsets into the lists [n_lists + 1].
    * The last value contains the total length of the index.
    */
-  device_mdarray<IdxT, extent_1d, row_major> list_offsets;
+  rmm::device_uvector<IdxT> list_offsets;
   /** k-means cluster centers corresponding to the lists [n_lists, dim] */
-  device_mdarray<float, extent_2d, row_major> centers;
+  rmm::device_uvector<float> centers;
   /** (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists]  */
-  std::optional<device_mdarray<float, extent_1d, row_major>> center_norms;
+  std::optional<rmm::device_uvector<float>> center_norms;
 
   // Don't allow copying the index for performance reasons (try avoiding copying data)
   index(const index&) = delete;
@@ -108,33 +116,12 @@ struct index : knn::index {
   auto operator=(index&&) -> index& = default;
   ~index()                          = default;
 
-  /** Total length of the index. */
-  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
-  {
-    return static_cast<uint32_t>(data.extent(0));
-  }
-  /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
-  {
-    return static_cast<uint32_t>(data.extent(1));
-  }
-  /** Number of clusters/inverted lists. */
-  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t
-  {
-    return static_cast<uint32_t>(centers.extent(0));
-  }
-
   /** Throw an error if the index content is inconsistent. */
   inline void check_consistency() const
   {
-    RAFT_EXPECTS(dim() % veclen == 0, "dimensionality is not a multiple of the veclen");
-    RAFT_EXPECTS(data.extent(0) == indices.extent(0), "inconsistent index size");
-    RAFT_EXPECTS(data.extent(1) == centers.extent(1), "inconsistent data dimensionality");
-    RAFT_EXPECTS(                                             //
-      (centers.extent(0) == list_sizes.extent(0)) &&          //
-        (centers.extent(0) + 1 == list_offsets.extent(0)) &&  //
-        (!center_norms.has_value() || centers.extent(0) == center_norms->extent(0)),
-      "inconsistent number of lists (clusters)");
+    RAFT_EXPECTS(dim % veclen == 0, "dimensionality is not a multiple of the veclen");
+    RAFT_EXPECTS(list_offsets.size() == list_sizes.size() + 1,
+                 "inconsistent number of lists (clusters)");
     RAFT_EXPECTS(reinterpret_cast<size_t>(data.data()) % (veclen * sizeof(T)) == 0,
                  "The data storage pointer is not aligned to the vector length");
   }