diff --git a/cpp/bench/spatial/knn.cu b/cpp/bench/spatial/knn.cu
index 72a1244269..64a1217d7f 100644
--- a/cpp/bench/spatial/knn.cu
+++ b/cpp/bench/spatial/knn.cu
@@ -17,7 +17,8 @@
 #include
 #include
-#include
+
+#include
 #if defined RAFT_NN_COMPILED
 #include
 #endif
 
@@ -126,6 +127,34 @@ struct host_uvector {
   T* arr_;
 };
 
+template <typename ValT, typename IdxT>
+struct ivf_flat_knn {
+  using dist_t = float;
+
+  std::optional<raft::spatial::knn::ivf_flat::index<ValT, IdxT>> index;
+  raft::spatial::knn::ivf_flat::index_params index_params;
+  raft::spatial::knn::ivf_flat::search_params search_params;
+  params ps;
+
+  ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  {
+    index_params.n_lists = 4096;
+    index_params.metric  = raft::distance::DistanceType::L2Expanded;
+    index.emplace(raft::spatial::knn::ivf_flat::build(
+      handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
+  }
+
+  void search(const raft::handle_t& handle,
+              const ValT* search_items,
+              dist_t* out_dists,
+              IdxT* out_idxs)
+  {
+    search_params.n_probes = 20;
+    raft::spatial::knn::ivf_flat::search(
+      handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
+  }
+};
+
 template <typename ValT, typename IdxT>
 struct brute_force_knn {
   using dist_t = ValT;
@@ -326,7 +355,13 @@ const std::vector<Scope> kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::B
 }
 
 KNN_REGISTER(float, int64_t, brute_force_knn, kInputs, kAllStrategies, kScopeFull);
+KNN_REGISTER(float, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, int64_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 KNN_REGISTER(float, uint32_t, brute_force_knn, kInputs, kNoCopyOnly, kScopeFull);
+KNN_REGISTER(float, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(int8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
+KNN_REGISTER(uint8_t, uint32_t, ivf_flat_knn, kInputs, kNoCopyOnly, kAllScopes);
 
 }  // namespace raft::bench::spatial
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index 41dc9cab08..0c4750aa69 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,121 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ +DI void sts(uint8_t* addr, const uint8_t& x) +{ + uint32_t x_int; + x_int = x; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int)); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[1]) +{ + uint32_t x_int[1]; + x_int[0] = x[0]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[2]) +{ + uint32_t x_int[2]; + x_int[0] = x[0]; + x_int[1] = x[1]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); +} +DI void sts(uint8_t* addr, const uint8_t (&x)[4]) +{ + uint32_t x_int[4]; + x_int[0] = x[0]; + x_int[1] = x[1]; + x_int[2] = x[2]; + x_int[3] = x[3]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u8 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); +} + +DI void sts(int8_t* addr, const int8_t& x) +{ + int32_t x_int = x; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int)); +} +DI void sts(int8_t* addr, const int8_t (&x)[1]) +{ + int32_t x_int[1]; + x_int[0] = x[0]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.s8 [%0], {%1};" : : "l"(s1), "r"(x_int[0])); +} +DI void sts(int8_t* addr, const int8_t (&x)[2]) +{ + int32_t x_int[2]; + x_int[0] = x[0]; + x_int[1] = x[1]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.s8 [%0], {%1, %2};" : : "l"(s2), "r"(x_int[0]), "r"(x_int[1])); +} +DI void sts(int8_t* addr, const int8_t (&x)[4]) +{ + int32_t x_int[4]; + x_int[0] = x[0]; + x_int[1] = x[1]; + x_int[2] = x[2]; + x_int[3] = x[3]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.s8 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x_int[0]), "r"(x_int[1]), "r"(x_int[2]), "r"(x_int[3])); +} + +DI void sts(uint32_t* addr, const uint32_t& x) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[1]) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[2]) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); +} +DI void sts(uint32_t* addr, const uint32_t (&x)[4]) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); +} + +DI void sts(int32_t* addr, const int32_t& x) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x)); +} +DI void sts(int32_t* addr, const int32_t (&x)[1]) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u32 [%0], {%1};" : : "l"(s1), "r"(x[0])); +} +DI void sts(int32_t* addr, const int32_t (&x)[2]) +{ + auto s2 = 
__cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u32 [%0], {%1, %2};" : : "l"(s2), "r"(x[0]), "r"(x[1])); +} +DI void sts(int32_t* addr, const int32_t (&x)[4]) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); +} + DI void sts(float* addr, const float& x) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -83,6 +198,152 @@ DI void sts(double* addr, const double (&x)[2]) * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ + +DI void lds(uint8_t& x, const uint8_t* addr) +{ + uint32_t x_int; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); + x = x_int; +} +DI void lds(uint8_t (&x)[1], const uint8_t* addr) +{ + uint32_t x_int[1]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); + x[0] = x_int[0]; +} +DI void lds(uint8_t (&x)[2], const uint8_t* addr) +{ + uint32_t x_int[2]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} +DI void lds(uint8_t (&x)[4], const uint8_t* addr) +{ + uint32_t x_int[4]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(s4)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void lds(int8_t& x, const int8_t* addr) +{ + int32_t x_int; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int) : "l"(s1)); + x = x_int; +} +DI void lds(int8_t (&x)[1], const int8_t* addr) +{ + int32_t x_int[1]; + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.s8 {%0}, [%1];" : "=r"(x_int[0]) : "l"(s1)); + x[0] = x_int[0]; +} +DI void lds(int8_t (&x)[2], const int8_t* addr) +{ + int32_t x_int[2]; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(s2)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} +DI void lds(int8_t (&x)[4], const int8_t* addr) +{ + int32_t x_int[4]; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.s8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(s4)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void lds(uint32_t (&x)[4], const uint32_t* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(s4)); +} + +DI void lds(uint32_t (&x)[2], const uint32_t* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); +} + +DI void lds(uint32_t (&x)[1], const uint32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); +} + +DI void lds(uint32_t& x, const uint32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm 
volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); +} + +DI void lds(int32_t (&x)[4], const int32_t* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(s4)); +} + +DI void lds(int32_t (&x)[2], const int32_t* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(s2)); +} + +DI void lds(int32_t (&x)[1], const int32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x[0]) : "l"(s1)); +} + +DI void lds(int32_t& x, const int32_t* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); +} + +DI void lds(float& x, const float* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); +} +DI void lds(float (&x)[1], const float* addr) +{ + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); +} +DI void lds(float (&x)[2], const float* addr) +{ + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); +} +DI void lds(float (&x)[4], const float* addr) +{ + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" + : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) + : "l"(s4)); +} + DI void lds(float& x, float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -159,6 +420,119 @@ DI void ldg(double (&x)[2], const double* addr) { asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } + +DI void ldg(uint32_t (&x)[4], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(addr)); +} + +DI void ldg(uint32_t (&x)[2], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); +} + +DI void ldg(uint32_t (&x)[1], const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); +} + +DI void ldg(uint32_t& x, const uint32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +DI void ldg(int32_t (&x)[4], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(x[0]), "=r"(x[1]), "=r"(x[2]), "=r"(x[3]) + : "l"(addr)); +} + +DI void ldg(int32_t (&x)[2], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.v2.u32 {%0, %1}, [%2];" : "=r"(x[0]), "=r"(x[1]) : "l"(addr)); +} + +DI void ldg(int32_t (&x)[1], const int32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x[0]) : "l"(addr)); +} + +DI void ldg(int32_t& x, const int32_t* const& addr) +{ + asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr)); +} + +DI void ldg(uint8_t (&x)[4], const uint8_t* const& addr) +{ + uint32_t x_int[4]; + asm volatile("ld.global.cg.v4.u8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void ldg(uint8_t (&x)[2], const uint8_t* const& 
addr) +{ + uint32_t x_int[2]; + asm volatile("ld.global.cg.v2.u8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} + +DI void ldg(uint8_t (&x)[1], const uint8_t* const& addr) +{ + uint32_t x_int; + asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x[0] = x_int; +} + +DI void ldg(uint8_t& x, const uint8_t* const& addr) +{ + uint32_t x_int; + asm volatile("ld.global.cg.u8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x = x_int; +} + +DI void ldg(int8_t (&x)[4], const int8_t* const& addr) +{ + int x_int[4]; + asm volatile("ld.global.cg.v4.s8 {%0, %1, %2, %3}, [%4];" + : "=r"(x_int[0]), "=r"(x_int[1]), "=r"(x_int[2]), "=r"(x_int[3]) + : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; + x[2] = x_int[2]; + x[3] = x_int[3]; +} + +DI void ldg(int8_t (&x)[2], const int8_t* const& addr) +{ + int x_int[2]; + asm volatile("ld.global.cg.v2.s8 {%0, %1}, [%2];" : "=r"(x_int[0]), "=r"(x_int[1]) : "l"(addr)); + x[0] = x_int[0]; + x[1] = x_int[1]; +} + +DI void ldg(int8_t& x, const int8_t* const& addr) +{ + int x_int; + asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x = x_int; +} + +DI void ldg(int8_t (&x)[1], const int8_t* const& addr) +{ + int x_int; + asm volatile("ld.global.cg.s8 %0, [%1];" : "=r"(x_int) : "l"(addr)); + x[0] = x_int; +} + /** @} */ } // namespace raft diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp index 7fe6ecaf6b..e0957ea1f3 100644 --- a/cpp/include/raft/core/cudart_utils.hpp +++ b/cpp/include/raft/core/cudart_utils.hpp @@ -26,7 +26,9 @@ #include #include +#include #include +#include #include @@ -445,6 +447,53 @@ constexpr T upper_bound() return std::numeric_limits::max(); } +/** + * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned + * unique pointer. + * + * This function is useful in the code where multiple repeated allocations/deallocations are + * expected. + * Use case example: + * @code{.cpp} + * void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) { + * auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float)); + * if (pool_guard){ + * RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size()); + * } else { + * RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource"); + * } + * rmm::device_uvector x(n, stream, mr); + * rmm::device_uvector y(n, stream, mr); + * ... + * } + * @endcode + * Here, the new memory resource would be created within the function scope if the passed `mr` is + * null and the default resource is not a pool. After the call, `mr` contains a valid memory + * resource in any case. + * + * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it + * into a `pool_memory_resource` if neccessary and return the pointer to the result. + * @param initial_size if a new memory pool is created, this would be its initial size (rounded up + * to 256 bytes). + * + * @return if a new memory pool is created, it returns a unique_ptr to it; + * this managed pointer controls the lifetime of the created memory resource. 
+ */
+inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_t initial_size)
+{
+  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  std::unique_ptr<pool_res_t> pool_res{};
+  if (mr) return pool_res;
+  mr = rmm::mr::get_current_device_resource();
+  if (!dynamic_cast<pool_res_t*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
+    pool_res = std::make_unique<pool_res_t>(mr, (initial_size + 255) & (~255));
+    mr       = pool_res.get();
+  }
+  return pool_res;
+}
+
 }  // namespace raft
 
 #endif
diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 362dba66c5..19800cb2d9 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -649,6 +649,67 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff
 #endif
 }
 
+/**
+ * @brief Four-way byte dot product-accumulate.
+ * @tparam T Four-byte integer: int or unsigned int
+ * @tparam S Either same as T or a 4-byte vector of the same signedness.
+ *
+ * @param a
+ * @param b
+ * @param c
+ * @return dot(a, b) + c
+ */
+template <typename T, typename S = T>
+DI auto dp4a(S a, S b, T c) -> T;
+
+template <>
+DI auto dp4a(char4 a, char4 b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<int>(a.x) * static_cast<int>(b.x);
+  c += static_cast<int>(a.y) * static_cast<int>(b.y);
+  c += static_cast<int>(a.z) * static_cast<int>(b.z);
+  c += static_cast<int>(a.w) * static_cast<int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(uchar4 a, uchar4 b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  c += static_cast<unsigned int>(a.x) * static_cast<unsigned int>(b.x);
+  c += static_cast<unsigned int>(a.y) * static_cast<unsigned int>(b.y);
+  c += static_cast<unsigned int>(a.z) * static_cast<unsigned int>(b.z);
+  c += static_cast<unsigned int>(a.w) * static_cast<unsigned int>(b.w);
+  return c;
+#endif
+}
+
+template <>
+DI auto dp4a(int a, int b, int c) -> int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<char4*>(&a), *reinterpret_cast<char4*>(&b), c);
+#endif
+}
+
+template <>
+DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
+{
+#if __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  return dp4a(*reinterpret_cast<uchar4*>(&a), *reinterpret_cast<uchar4*>(&b), c);
+#endif
+}
+
 /**
  * @brief Warp-level sum reduction
  * @param val input value
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 9d749ee47b..48094e3ccf 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -107,10 +107,13 @@ class device_uvector {
   /**
    * @brief Ctor that accepts a size, stream and an optional mr.
    */
-  explicit device_uvector(
-    std::size_t size,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  explicit device_uvector(std::size_t size, rmm::cuda_stream_view stream) : data_{size, stream} {}
+  /**
+   * @brief Ctor that accepts a size, stream and a memory resource.
+   */
+  explicit device_uvector(std::size_t size,
+                          rmm::cuda_stream_view stream,
+                          rmm::mr::device_memory_resource* mr)
     : data_{size, stream, mr}
   {
   }
@@ -162,14 +165,10 @@ class device_uvector_policy {
   }
 
   device_uvector_policy() = delete;
-  explicit device_uvector_policy(rmm::cuda_stream_view stream) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
-    : stream_{stream}, mr_(nullptr)
-  {
-  }
-
-  device_uvector_policy(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) noexcept(
-    std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
+  explicit device_uvector_policy(
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr =
+      nullptr) noexcept(std::is_nothrow_copy_constructible_v<rmm::cuda_stream_view>)
     : stream_{stream}, mr_(mr)
   {
   }
diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index 5fc56de14b..a2ce7598c6 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  * Copyright 2019 Eyal Rozenberg
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,4 +161,24 @@ std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T
   return value;
 }
 
+/**
+ * @defgroup Check whether the numeric conversion is narrowing
+ *
+ * @tparam From source type
+ * @tparam To destination type
+ * @{
+ */
+template <typename From, typename To, typename = void>
+struct is_narrowing : std::true_type {
+};
+
+template <typename From, typename To>
+struct is_narrowing<From, To, std::void_t<decltype(To{std::declval<From>()})>> : std::false_type {
+};
+/** @} */
+
+/** Check whether the numeric conversion is narrowing */
+template <typename From, typename To>
+inline constexpr bool is_narrowing_v = is_narrowing<From, To>::value;  // NOLINT
+
 }  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index 2ef2ae0fa4..befb5524ac 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -14,20 +14,14 @@
  * limitations under the License.
*/ -#ifndef __ANN_H -#define __ANN_H - #pragma once #include "ann_common.h" -#include "detail/ann_quantized_faiss.cuh" +#include "detail/ann_quantized.cuh" -#include -#include +#include -namespace raft { -namespace spatial { -namespace knn { +namespace raft::spatial::knn { /** * @brief Flat C++ API function to build an approximate nearest neighbors index @@ -42,16 +36,19 @@ namespace knn { * @param[in] n number of rows in the index array * @param[in] D the dimensionality of the index array */ -template -inline void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - knnIndexParam* params, - raft::distance::DistanceType metric, - float metricArg, - float* index_array, - value_idx n, - value_idx D) +template +[[deprecated("Consider using new-style raft::spatial::knn::*::build functions")]] inline void +approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, + raft::distance::DistanceType metric, + float metricArg, + T* index_array, + value_idx n, + value_idx D) { + common::nvtx::range fun_scope( + "legacy approx_knn_build_index(n_rows = %u, dim = %u)", n, D); detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } @@ -68,20 +65,19 @@ inline void approx_knn_build_index(raft::handle_t& handle, * @param[in] query_array the query to perform a search with * @param[in] n number of rows in the query array */ -template -inline void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - value_idx k, - float* query_array, - value_idx n) +template +[[deprecated("Consider using new-style raft::spatial::knn::*::search functions")]] inline void +approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + T* query_array, + value_idx n) { + common::nvtx::range fun_scope( + "legacy approx_knn_search(k = %u, n_queries = %u)", k, n); detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } -} // namespace knn -} // namespace spatial -} // namespace raft - -#endif \ No newline at end of file +} // namespace raft::spatial::knn diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index b6d3ca2976..516435271d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -13,79 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -#ifndef __ANN_H -#define __ANN_H #pragma once -#include "ann_common.h" -#include "detail/ann_quantized_faiss.cuh" - -#include -#include - -namespace raft { -namespace spatial { -namespace knn { - -/** - * @brief Flat C++ API function to build an approximate nearest neighbors index - * from an index array and a set of parameters. - * - * @param[in] handle RAFT handle - * @param[out] index index to be built - * @param[in] params parametrization of the index to be built - * @param[in] metric distance metric to use. 
Euclidean (L2) is used by default - * @param[in] metricArg metric argument - * @param[in] index_array the index array to build the index with - * @param[in] n number of rows in the index array - * @param[in] D the dimensionality of the index array - */ -template -inline void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - knnIndexParam* params, - raft::distance::DistanceType metric, - float metricArg, - float* index_array, - value_idx n, - value_idx D) -{ - detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); -} - -/** - * @brief Flat C++ API function to perform an approximate nearest neighbors - * search from previously built index and a query array - * - * @param[in] handle RAFT handle - * @param[out] distances distances of the nearest neighbors toward - * their query point - * @param[out] indices indices of the nearest neighbors - * @param[in] index index to perform a search with - * @param[in] k the number of nearest neighbors to search for - * @param[in] query_array the query to perform a search with - * @param[in] n number of rows in the query array - */ -template -inline void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - value_idx k, - float* query_array, - value_idx n) -{ - detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); -} - -} // namespace knn -} // namespace spatial -} // namespace raft +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the cuh version instead.") -#endif \ No newline at end of file +#include "ann.cuh" diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 5cdd6b1141..45867dbfee 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -14,8 +14,15 @@ * limitations under the License. */ +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the other approximate KNN implementations defined in spatial/knn/*.") + #pragma once +#include "detail/processing.hpp" +#include "ivf_flat_types.hpp" + #include #include @@ -26,19 +33,43 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; + int nprobe; + std::unique_ptr index; + std::unique_ptr> metric_processor; + std::unique_ptr> ivf_flat_float_; + std::unique_ptr> ivf_flat_uint8_t_; + std::unique_ptr> ivf_flat_int8_t_; - raft::spatial::knn::RmmGpuResources* gpu_res; + std::unique_ptr gpu_res; int device; - ~knnIndex() - { - delete index; - delete gpu_res; - } + + template + auto ivf_flat() -> std::unique_ptr>&; }; +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_float_; +} + +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_uint8_t_; +} + +template <> +inline auto knnIndex::ivf_flat() + -> std::unique_ptr>& +{ + return ivf_flat_int8_t_; +} + enum QuantizerType : unsigned int { QT_8bit, QT_4bit, @@ -72,6 +103,17 @@ struct IVFSQParam : IVFParam { bool encodeResidual; }; +inline auto from_legacy_index_params(const IVFFlatParam& legacy, + raft::distance::DistanceType metric, + float metric_arg) +{ + ivf_flat::index_params params; + params.metric = metric; + params.metric_arg = metric_arg; + params.n_lists = legacy.nlist; + return params; +} + }; // namespace knn }; // namespace spatial }; // namespace raft diff --git a/cpp/include/raft/spatial/knn/common.hpp b/cpp/include/raft/spatial/knn/common.hpp new file mode 100644 index 0000000000..caaa951a66 --- /dev/null +++ b/cpp/include/raft/spatial/knn/common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn { + +/** The base for approximate KNN index structures. */ +struct index { +}; + +/** The base for KNN index parameters. */ +struct index_params { + /** Distance type. */ + raft::distance::DistanceType metric = distance::DistanceType::L2Expanded; + /** The argument used by some distance metrics. */ + float metric_arg = 2.0f; + /** + * Whether to add the dataset content to the index, i.e.: + * + * - `true` means the index is filled with the dataset vectors and ready to search after calling + * `build`. + * - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but + * the index is left empty; you'd need to call `extend` on the index afterwards to populate it. 
+ */ + bool add_data_on_build = true; +}; + +struct search_params { +}; + +}; // namespace raft::spatial::knn diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh new file mode 100644 index 0000000000..74e1ae75a8 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -0,0 +1,706 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "ann_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace raft::spatial::knn::detail::kmeans { + +/** + * @brief Predict labels for the dataset; floats only. + * + * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows + * * n_cluster * sizeof(float)). + * + * @param handle + * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[out] labels output predictions [n_rows] + * @param metric + * @param stream + * @param mr (optional) memory resource to use for temporary allocations + */ +void predict_float_core(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const float* dataset, + size_t n_rows, + uint32_t* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + rmm::device_uvector distances(n_rows * n_clusters, stream, mr); + + float alpha; + float beta; + switch (metric) { + case raft::distance::DistanceType::InnerProduct: { + alpha = -1.0; + beta = 0.0; + } break; + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2Unexpanded: { + rmm::device_uvector sqsum_centers(n_clusters, stream, mr); + rmm::device_uvector sqsum_data(n_rows, stream, mr); + utils::dots_along_rows(n_clusters, dim, centers, sqsum_centers.data(), stream); + utils::dots_along_rows(n_rows, dim, dataset, sqsum_data.data(), stream); + utils::outer_add( + sqsum_data.data(), n_rows, sqsum_centers.data(), n_clusters, distances.data(), stream); + alpha = -2.0; + beta = 1.0; + } break; + // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. + default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); + } + linalg::gemm(handle, + true, + false, + n_clusters, + n_rows, + dim, + &alpha, + centers, + dim, + dataset, + dim, + &beta, + distances.data(), + n_clusters, + stream); + utils::argmin_along_rows(n_rows, n_clusters, distances.data(), labels, stream); +} + +/** + * @brief Suggest a minibatch size for kmeans prediction. 
+ * + * This function is used as a heuristic to split the work over a large dataset + * to reduce the size of temporary memory allocations. + * + * @param n_clusters number of clusters in kmeans clustering + * @param n_rows dataset size + * @return a suggested minibatch size + */ +constexpr auto calc_minibatch_size(uint32_t n_clusters, size_t n_rows) -> uint32_t +{ + n_clusters = std::max(1, n_clusters); + uint32_t minibatch_size = (1 << 20); + if (minibatch_size > (1 << 28) / n_clusters) { + minibatch_size = (1 << 28) / n_clusters; + minibatch_size += 32; + minibatch_size -= minibatch_size % 64; + } + minibatch_size = uint32_t(std::min(minibatch_size, n_rows)); + return minibatch_size; +} + +/** + * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. + * + * Let S_i = {x_k | x_k \in dataset & labels[k] == i} be the vectors in the dataset with label i. + * On exit centers_i = normalize(\sum_{x \in S_i} x), where `normalize` depends on the distance + * type. + * + * NB: `centers` and `cluster_sizes` must be accessible on GPU due to + * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are + * accessible from the same place. + * + * i.e. two variants are possible: + * + * 1. All pointers are on the device. + * 2. All pointers are on the host, but `centers` and `cluster_sizes` are accessible from GPU. + * + * @tparam T element type + * + * @param[inout] centers pointer to the output [n_clusters, dim] + * @param[inout] cluster_sizes number of rows in each cluster [n_clusters] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[in] labels output predictions [n_rows] + * @param reset_counters whether to clear the output arrays before calculating. + * When set to `false`, this function may be used to update existing centers and sizes using + * the weighted average principle. + * @param stream + */ +template +void calc_centers_and_sizes(float* centers, + uint32_t* cluster_sizes, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + const uint32_t* labels, + bool reset_counters, + rmm::cuda_stream_view stream) +{ + if (reset_counters) { + utils::memzero(centers, n_clusters * dim, stream); + utils::memzero(cluster_sizes, n_clusters, stream); + } else { + utils::map_along_rows( + n_clusters, + dim, + centers, + cluster_sizes, + [] __device__(float c, uint32_t s) -> float { return c * s; }, + stream); + } + utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream); + utils::map_along_rows( + n_clusters, + dim, + centers, + cluster_sizes, + [] __device__(float c, uint32_t s) -> float { return s == 0 ? 0.0f : c / float(s); }, + stream); +} + +/** + * @brief Predict labels for the dataset. 
+ * + * @tparam T element type + * + * @param handle + * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param n_clusters number of clusters/centers + * @param dim dimensionality of the data + * @param[in] dataset a pointer to the data [n_rows, dim] + * @param n_rows number samples in the `dataset` + * @param[out] labels output predictions [n_rows] + * @param metric + * @param stream + * @param mr (optional) memory resource to use for temporary allocations + */ + +template +void predict(const handle_t& handle, + const float* centers, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + uint32_t* labels, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) +{ + common::nvtx::range fun_scope( + "kmeans::predict(%zu, %u)", n_rows, n_clusters); + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + const uint32_t max_minibatch_size = calc_minibatch_size(n_clusters, n_rows); + rmm::device_uvector cur_dataset( + std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); + auto cur_dataset_ptr = cur_dataset.data(); + for (size_t offset = 0; offset < n_rows; offset += max_minibatch_size) { + auto minibatch_size = std::min(max_minibatch_size, n_rows - offset); + + if constexpr (std::is_same_v) { + cur_dataset_ptr = const_cast(dataset + offset * dim); + } else { + linalg::unaryOp(cur_dataset_ptr, + dataset + offset * dim, + minibatch_size * dim, + utils::mapping{}, + stream); + } + + predict_float_core(handle, + centers, + n_clusters, + dim, + cur_dataset_ptr, + minibatch_size, + labels + offset, + metric, + stream, + mr); + } +} + +/** + * @brief Adjust centers for clusters that have small number of entries. + * + * For each cluster, where the cluster size is not bigger than a threshold, the center is moved + * towards a data point that belongs to a large cluster. + * + * NB: if this function returns `true`, you should update the labels. + * + * NB: all pointers are used on the host side. + * + * @tparam T element type + * + * @param[inout] centers cluster centers [n_clusters, dim] + * @param n_clusters number of rows in `centers` + * @param dim number of columns in `centers` and `dataset` + * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim] + * @param n_rows number of rows in `dataset` + * @param[in] labels a host pointer to the cluster indices [n_rows] + * @param[in] cluster_sizes number of rows in each cluster [n_clusters] + * @param threshold defines a criterion for adjusting a cluster + * (cluster_sizes <= average_size * threshold) + * 0 <= threshold < 1 + * @param stream + * + * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated). 
+ */ +template +auto adjust_centers(float* centers, + uint32_t n_clusters, + uint32_t dim, + const T* dataset, + size_t n_rows, + const uint32_t* labels, + const uint32_t* cluster_sizes, + float threshold, + rmm::cuda_stream_view stream) -> bool +{ + common::nvtx::range fun_scope( + "kmeans::adjust_centers(%zu, %u)", n_rows, n_clusters); + stream.synchronize(); + if (n_clusters == 0) { return false; } + constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, + 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, + 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, + 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; + static size_t i = 0; + static size_t i_primes = 0; + + bool adjusted = false; + uint32_t average = static_cast(n_rows / size_t(n_clusters)); + uint32_t ofst; + + do { + i_primes = (i_primes + 1) % kPrimes.size(); + ofst = kPrimes[i_primes]; + } while (n_rows % ofst == 0); + + for (uint32_t l = 0; l < n_clusters; l++) { + auto csize = cluster_sizes[l]; + // skip big clusters + if (csize > static_cast(average * threshold)) continue; + // choose a "random" i that belongs to a rather large cluster + do { + i = (i + ofst) % n_rows; + } while (cluster_sizes[labels[i]] < average); + // Adjust the center of the selected smaller cluster to gravitate towards + // a sample from the selected larger cluster. + const size_t li = labels[i]; + // Weight of the current center for the weighted average. + // We dump it for anomalously small clusters, but keep constant overwise. + const float wc = std::min(csize, 7.0); + // Weight for the datapoint used to shift the center. + const float wd = 1.0; + for (uint32_t j = 0; j < dim; j++) { + float val = 0; + val += wc * centers[j + dim * li]; + val += wd * utils::mapping{}(dataset[j + size_t(dim) * i]); + val /= wc + wd; + centers[j + dim * l] = val; + } + adjusted = true; + } + stream.synchronize(); + return adjusted; +} + +/** predict & adjust_centers combined in an iterative process. */ +template +void build_clusters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, // managedl [n_rows, dim] + size_t n_rows, + uint32_t n_clusters, + float* cluster_centers, // managed; [n_clusters, dim] + uint32_t* cluster_labels, // managed; [n_rows] + uint32_t* cluster_sizes, // managed; [n_clusters] + raft::distance::DistanceType metric, + rmm::mr::device_memory_resource* device_memory, + rmm::cuda_stream_view stream) +{ + // "randomly initialize labels" + auto f = [n_clusters] __device__(uint32_t * out, size_t i) { + *out = uint32_t(i % size_t(n_clusters)); + }; + linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); + + // update centers to match the initialized labels. + calc_centers_and_sizes( + cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); + + for (uint32_t iter = 0; iter < 2 * n_iters; iter += 2) { + switch (metric) { + // For some metrics, cluster calculation and adjustment tends to favor zero center vectors. + // To avoid converging to zero, we normalize the center vectors on every iteration. 
+ case raft::distance::DistanceType::InnerProduct: + case raft::distance::DistanceType::CosineExpanded: + case raft::distance::DistanceType::CorrelationExpanded: + utils::normalize_rows(n_clusters, dim, cluster_centers, stream); + default: break; + } + predict(handle, + cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + metric, + stream, + device_memory); + calc_centers_and_sizes(cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + stream); + + if (iter + 1 < 2 * n_iters) { + if (kmeans::adjust_centers(cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + cluster_sizes, + (float)1.0 / 4, + stream)) { + iter -= 1; + } + } + } +} + +/** Calculate how many fine clusters should belong to each mesocluster. */ +auto arrange_fine_clusters(uint32_t n_clusters, + uint32_t n_mesoclusters, + size_t n_rows, + const uint32_t* mesocluster_sizes) +{ + std::vector fine_clusters_nums(n_mesoclusters); + std::vector fine_clusters_csum(n_mesoclusters + 1); + fine_clusters_csum[0] = 0; + + uint32_t n_lists_rem = n_clusters; + uint32_t n_nonempty_ms_rem = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + n_nonempty_ms_rem += mesocluster_sizes[i] > 0 ? 1 : 0; + } + size_t n_rows_rem = n_rows; + size_t mesocluster_size_sum = 0; + uint32_t mesocluster_size_max = 0; + uint32_t fine_clusters_nums_max = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + if (i < n_mesoclusters - 1) { + // Although the algorithm is meant to produce balanced clusters, when something + // goes wrong, we may get empty clusters (e.g. during development/debugging). + // The code below ensures a proportional arrangement of fine cluster numbers + // per mesocluster, even if some clusters are empty. + if (mesocluster_sizes[i] == 0) { + fine_clusters_nums[i] = 0; + } else { + n_nonempty_ms_rem--; + auto s = uint32_t((double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5); + s = std::min(s, n_lists_rem - n_nonempty_ms_rem); + fine_clusters_nums[i] = std::max(s, 1); + } + } else { + fine_clusters_nums[i] = n_lists_rem; + } + n_lists_rem -= fine_clusters_nums[i]; + n_rows_rem -= mesocluster_sizes[i]; + mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]); + mesocluster_size_sum += mesocluster_sizes[i]; + fine_clusters_nums_max = max(fine_clusters_nums_max, fine_clusters_nums[i]); + fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i]; + } + + RAFT_EXPECTS(mesocluster_size_sum == n_rows, + "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)", + mesocluster_size_sum, + n_rows); + RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters, + "fine cluster numbers do not add up (%u) to the total number of clusters (%u)", + fine_clusters_csum[n_mesoclusters], + n_clusters + + ); + + return std::make_tuple(mesocluster_size_max, + fine_clusters_nums_max, + std::move(fine_clusters_nums), + std::move(fine_clusters_csum)); +} + +/** + * Given the (coarse) mesoclusters and the distribution of fine clusters within them, + * build the fine clusters. + * + * Processing one mesocluster at a time: + * 1. Copy mesocluster data into a separate buffer + * 2. Predict fine cluster + * 3. Refince the fine cluster centers + * + * As a result, the fine clusters are what is returned by `build_optimized_kmeans`; + * this function returns the total number of fine clusters, which can be checked to be + * the same as the requested number of clusters. 
+ */ +template +auto build_fine_clusters(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset_mptr, + const uint32_t* labels_mptr, + size_t n_rows, + const uint32_t* fine_clusters_nums, + const uint32_t* fine_clusters_csum, + const uint32_t* mesocluster_sizes, + uint32_t n_mesoclusters, + uint32_t mesocluster_size_max, + uint32_t fine_clusters_nums_max, + float* cluster_centers, + raft::distance::DistanceType metric, + rmm::mr::managed_memory_resource* managed_memory, + rmm::mr::device_memory_resource* device_memory, + rmm::cuda_stream_view stream) -> uint32_t +{ + rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); + rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, managed_memory); + auto mc_trainset_ids = mc_trainset_ids_buf.data(); + auto mc_trainset = mc_trainset_buf.data(); + + // label (cluster ID) of each vector + rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, managed_memory); + + rmm::device_uvector mc_trainset_ccenters( + fine_clusters_nums_max * dim, stream, managed_memory); + // number of vectors in each cluster + rmm::device_uvector mc_trainset_csizes_tmp( + fine_clusters_nums_max, stream, managed_memory); + + // Training clusters in each meso-cluster + uint32_t n_clusters_done = 0; + for (uint32_t i = 0; i < n_mesoclusters; i++) { + uint32_t k = 0; + for (size_t j = 0; j < n_rows; j++) { + if (labels_mptr[j] == i) { mc_trainset_ids[k++] = j; } + } + RAFT_EXPECTS(k == mesocluster_sizes[i], "Incorrect mesocluster size at %d.", i); + if (k == 0) { + RAFT_LOG_DEBUG("Empty cluster %d", i); + RAFT_EXPECTS(fine_clusters_nums[i] == 0, + "Number of fine clusters must be zero for the empty mesocluster (got %d)", + fine_clusters_nums[i]); + continue; + } else { + RAFT_EXPECTS(fine_clusters_nums[i] > 0, + "Number of fine clusters must be non-zero for a non-empty mesocluster"); + } + + utils::copy_selected( + mesocluster_sizes[i], dim, dataset_mptr, mc_trainset_ids, dim, mc_trainset, dim, stream); + + build_clusters(handle, + n_iters, + dim, + mc_trainset, + mesocluster_sizes[i], + fine_clusters_nums[i], + mc_trainset_ccenters.data(), + mc_trainset_labels.data(), + mc_trainset_csizes_tmp.data(), + metric, + device_memory, + stream); + + raft::copy(cluster_centers + (dim * fine_clusters_csum[i]), + mc_trainset_ccenters.data(), + fine_clusters_nums[i] * dim, + stream); + handle.sync_stream(stream); + n_clusters_done += fine_clusters_nums[i]; + } + return n_clusters_done; +} + +/** + * kmeans + * + * @tparam T element type + * + * @param handle + * @param n_iters number of training iterations + * @param dim number of columns in `centers` and `dataset` + * @param[in] dataset a device pointer to the source dataset [n_rows, dim] + * @param n_rows number of rows in the input + * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim] + * @param n_cluster + * @param trainset_fraction a fraction of rows in the `dataset` to sample for kmeans training; + * 0 < trainset_fraction <= 1. 
+ * @param metric the distance metric + * @param stream + */ +template +void build_optimized_kmeans(const handle_t& handle, + uint32_t n_iters, + uint32_t dim, + const T* dataset, + size_t n_rows, + float* cluster_centers, + uint32_t n_clusters, + double trainset_fraction, + raft::distance::DistanceType metric, + rmm::cuda_stream_view stream) +{ + common::nvtx::range fun_scope( + "kmeans::build_optimized_kmeans(%zu, %u)", n_rows, n_clusters); + + auto trainset_ratio = + std::max(1, n_rows / std::max(trainset_fraction * n_rows, n_clusters)); + auto n_rows_train = n_rows / trainset_ratio; + + uint32_t n_mesoclusters = std::min(n_clusters, std::sqrt(n_clusters) + 0.5); + RAFT_LOG_DEBUG("(%s) # n_mesoclusters: %u", __func__, n_mesoclusters); + + rmm::mr::managed_memory_resource managed_memory; + rmm::mr::device_memory_resource* device_memory = nullptr; + auto pool_guard = raft::get_pool_memory_resource( + device_memory, kmeans::calc_minibatch_size(n_mesoclusters, n_rows_train) * dim * 4); + if (pool_guard) { + RAFT_LOG_DEBUG( + "kmeans::build_optimized_kmeans: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::device_uvector trainset(n_rows_train * dim, stream, &managed_memory); + // TODO: a proper sampling + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), + sizeof(T) * dim, + dataset, + sizeof(T) * dim * trainset_ratio, + sizeof(T) * dim, + n_rows_train, + cudaMemcpyDefault, + stream)); + + // build coarse clusters (mesoclusters) + rmm::device_uvector mesocluster_labels_buf(n_rows_train, stream, &managed_memory); + rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); + { + rmm::device_uvector mesocluster_centers_buf( + n_mesoclusters * dim, stream, &managed_memory); + build_clusters(handle, + n_iters, + dim, + trainset.data(), + n_rows_train, + n_mesoclusters, + mesocluster_centers_buf.data(), + mesocluster_labels_buf.data(), + mesocluster_sizes_buf.data(), + metric, + device_memory, + stream); + } + + auto mesocluster_sizes = mesocluster_sizes_buf.data(); + auto mesocluster_labels = mesocluster_labels_buf.data(); + + handle.sync_stream(stream); + + // build fine clusters + auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = + arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows_train, mesocluster_sizes); + + if (mesocluster_size_max * n_mesoclusters > 2 * n_rows_train) { + RAFT_LOG_WARN("build_optimized_kmeans: built unbalanced mesoclusters"); + RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); + RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); + } + + auto n_clusters_done = build_fine_clusters(handle, + n_iters, + dim, + trainset.data(), + mesocluster_labels, + n_rows_train, + fine_clusters_nums.data(), + fine_clusters_csum.data(), + mesocluster_sizes, + n_mesoclusters, + mesocluster_size_max, + fine_clusters_nums_max, + cluster_centers, + metric, + &managed_memory, + device_memory, + stream); + RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); + + rmm::device_uvector cluster_sizes(n_clusters, stream, device_memory); + rmm::device_uvector labels(n_rows_train, stream, device_memory); + + // fit clusters using the trainset + for (int iter = 0; iter < 2; iter++) { + predict(handle, + cluster_centers, + n_clusters, + dim, + trainset.data(), + n_rows_train, + labels.data(), + metric, + stream, + device_memory); + calc_centers_and_sizes(cluster_centers, + cluster_sizes.data(), + n_clusters, + dim, + trainset.data(), + 
n_rows_train, + labels.data(), + true, + stream); + } +} + +} // namespace raft::spatial::knn::detail::kmeans diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh new file mode 100644 index 0000000000..5a56a84fe3 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ann_common.h" +#include "../ivf_flat.cuh" +#include "knn_brute_force_faiss.cuh" + +#include "common_faiss.h" +#include "processing.cuh" +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::spatial::knn::detail { + +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ + switch (qtype) { + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit_uniform: + return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; + case QuantizerType::QT_4bit_uniform: + return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_8bit_direct: + return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; + } +} + +template +void approx_knn_ivfflat_build_index(knnIndex* index, + const IVFFlatParam& params, + IntType n, + IntType D) +{ + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + index->index.reset( + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.nlist, faiss_metric, config)); +} + +template +void approx_knn_ivfpq_build_index(knnIndex* index, const IVFPQParam& params, IntType n, IntType D) +{ + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = index->device; + config.usePrecomputedTables = params.usePrecomputedTables; + config.interleavedLayout = params.n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + index->index.reset(new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res.get(), D, params.nlist, params.M, params.n_bits, faiss_metric, config)); +} + +template +void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, IntType n, IntType D) +{ + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(index->metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype); + index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, 
+    params.encodeResidual));
+}
+
+template <typename T = float, typename IntType = int>
+void approx_knn_build_index(const handle_t& handle,
+                            knnIndex* index,
+                            knnIndexParam* params,
+                            raft::distance::DistanceType metric,
+                            float metricArg,
+                            T* index_array,
+                            IntType n,
+                            IntType D)
+{
+  auto stream      = handle.get_stream();
+  index->index     = nullptr;
+  index->metric    = metric;
+  index->metricArg = metricArg;
+  if (dynamic_cast<IVFParam*>(params)) {
+    index->nprobe = dynamic_cast<IVFParam*>(params)->nprobe;
+  }
+  auto ivf_ft_pams = dynamic_cast<IVFFlatParam*>(params);
+  auto ivf_pq_pams = dynamic_cast<IVFPQParam*>(params);
+  auto ivf_sq_pams = dynamic_cast<IVFSQParam*>(params);
+
+  if constexpr (std::is_same_v<T, float>) {
+    index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
+  }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(index_array); }
+
+  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded ||
+                      metric == raft::distance::DistanceType::L2Expanded ||
+                      metric == raft::distance::DistanceType::InnerProduct)) {
+    auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
+    index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
+      ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
+  } else {
+    RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
+    index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
+    index->gpu_res->noTempMemory();
+    index->gpu_res->setDefaultStream(index->device, stream);
+    if (ivf_ft_pams) {
+      approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
+    } else if (ivf_pq_pams) {
+      approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
+    } else if (ivf_sq_pams) {
+      approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
+    } else {
+      RAFT_FAIL("Unrecognized index type.");
+    }
+    if constexpr (std::is_same_v<T, float>) {
+      index->index->train(n, index_array);
+      index->index->add(n, index_array);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
+    }
+  }
+
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(index_array); }
+}
+
+template <typename T = float, typename IntType = int>
+void approx_knn_search(const handle_t& handle,
+                       float* distances,
+                       int64_t* indices,
+                       knnIndex* index,
+                       IntType k,
+                       T* query_array,
+                       IntType n)
+{
+  auto faiss_ivf = dynamic_cast<faiss::gpu::GpuIndexIVF*>(index->index.get());
+  if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); }
+
+  if constexpr (std::is_same_v<T, float>) {
+    index->metric_processor->preprocess(query_array);
+    index->metric_processor->set_num_queries(k);
+  }
+
+  // search
+  if (faiss_ivf) {
+    if constexpr (std::is_same_v<T, float>) {
+      faiss_ivf->search(n, query_array, k, distances, indices);
+    } else {
+      RAFT_FAIL("FAISS-based index supports only float data.");
+    }
+  } else if (index->ivf_flat<T, int64_t>()) {
+    ivf_flat::search_params params;
+    params.n_probes = index->nprobe;
+    ivf_flat::search(
+      handle, params, *(index->ivf_flat<T, int64_t>()), query_array, n, k, indices, distances);
+  } else {
+    RAFT_FAIL("The model is not trained");
+  }
+
+  // revert changes to the query
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(query_array); }
+
+  // perform post-processing to show the real distances
+  if (index->metric == raft::distance::DistanceType::L2SqrtExpanded ||
+      index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+      index->metric == raft::distance::DistanceType::LpUnexpanded) {
+    /**
+     * post-processing
+     */
+    float p = 0.5;  // standard l2
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
+    raft::linalg::unaryOp(
+      distances,
+      distances,
+      n * k,
+      [p] __device__(float input) { return powf(input, p); },
+      handle.get_stream());
+  }
+  if constexpr (std::is_same_v<T, float>) { index->metric_processor->postprocess(distances); }
+}
+
+}  // namespace raft::spatial::knn::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
deleted file mode 100644
index 78631b431f..0000000000
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../ann_common.h"
-#include "knn_brute_force_faiss.cuh"
-
-#include "common_faiss.h"
-#include "processing.hpp"
-
-#include "processing.hpp"
-#include
-#include
-
-#include