From d9a7290b60d1037a7fbc00b4b6e5c371b8b86ca8 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 13 Dec 2023 17:30:36 +0100
Subject: [PATCH 01/11] Fix ann-bench multithreading (#2021)

In the current state, ann-benchmarks running in the `--throughput` mode
(multi-threaded) share ANN wrappers among CPU threads. This is not thread-safe
and may result in incorrectly measured time (e.g. sharing CUDA events among
CPU threads) or various exceptions and segfaults (e.g. doing state-changing
cublas calls from multiple CPU threads).

This PR makes the search benchmarks copy ANN wrappers in each thread. The
copies of the wrappers then selectively:
- share thread-safe resources (e.g. the rmm memory pool) and large objects
  that are not expected to change during search (e.g. index data);
- duplicate the resources that are not thread-safe or that carry
  thread-specific state (e.g. cublas handles, CUDA events and streams).

Alongside, the PR adds a few small changes:
- enabling ann-bench NVTX annotations for the non-common-executable mode
  (showing benchmark labels and iterations in the nsys timeline);
- fixing compile errors for the common-executable mode.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)
  - William Hicks (https://github.com/wphicks)

Approvers:
  - William Hicks (https://github.com/wphicks)
  - Mark Harris (https://github.com/harrism)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2021
---
 cpp/bench/ann/CMakeLists.txt                  | 45 ++++++---
 cpp/bench/ann/src/common/ann_types.hpp        | 15 ++-
 cpp/bench/ann/src/common/benchmark.hpp        | 16 ++-
 cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h   | 50 +++++++---
 cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h   | 50 +++++++---
 cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh       | 22 +++--
 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h   | 27 ++---
 cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 99 +++++++++++++++++++
 cpp/bench/ann/src/raft/raft_benchmark.cu      | 12 +--
 .../ann/src/raft/raft_cagra_hnswlib_wrapper.h | 55 +++++------
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h   | 98 ++++++++++--------
 .../ann/src/raft/raft_ivf_flat_wrapper.h      | 36 ++++---
 cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h  | 45 ++++-----
 13 files changed, 364 insertions(+), 206 deletions(-)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 5919de07e7..c144d1399e 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -116,6 +116,21 @@ if(RAFT_ANN_BENCH_USE_FAISS)
   include(cmake/thirdparty/get_faiss.cmake)
 endif()
 
+# ##################################################################################################
+# * Enable NVTX if available
+
+# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx. They track gbench
+# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
+# raft.
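This flag feeds the C++ side of the wrappers: when the `if(TARGET CUDA::nvtx3)` check below finds the headers, the `ANN_BENCH_NVTX3_HEADERS_FOUND` definition is set, and the benchmark code can guard its NVTX calls on that macro so CPU-only builds still compile. A minimal self-contained sketch of such a guard (`scoped_range` and `run_iteration` are illustrative names, not the benchmark's actual code):

```cpp
#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
#include <nvtx3/nvToolsExt.h>
#endif

// RAII guard: opens an NVTX range for the lifetime of the object when the
// headers were found at configure time; compiles to a no-op otherwise.
struct scoped_range {
#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
  explicit scoped_range(const char* name) { nvtxRangePushA(name); }
  ~scoped_range() { nvtxRangePop(); }
#else
  explicit scoped_range(const char*) {}
#endif
};

void run_iteration()
{
  scoped_range r{"benchmark-iteration"};  // shows up in the nsys timeline when NVTX is on
  // ... benchmark body ...
}
```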
+if(TARGET CUDA::nvtx3)
+  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
+  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
+  unset(NVTX3_HEADERS_FOUND CACHE)
+  # Check the headers explicitly to make sure the cpu-only build succeeds
+  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
+  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
+endif()
+
 # ##################################################################################################
 # * Configure tests function-------------------------------------------------------------
@@ -141,8 +156,13 @@ function(ConfigureAnnBench)
     add_dependencies(${BENCH_NAME} ANN_BENCH)
   else()
     add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
-    target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN)
-    target_link_libraries(${BENCH_NAME} PRIVATE benchmark::benchmark)
+    target_compile_definitions(
+      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
+                            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+    )
+    target_link_libraries(
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+    )
   endif()
 
   target_link_libraries(
@@ -340,8 +360,16 @@ if(RAFT_ANN_BENCH_SINGLE_EXE)
   target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
   target_link_libraries(
-    ANN_BENCH PRIVATE nlohmann_json::nlohmann_json benchmark_static dl -static-libgcc
-              -static-libstdc++ CUDA::nvtx3
+    ANN_BENCH
+    PRIVATE raft::raft
+            nlohmann_json::nlohmann_json
+            benchmark_static
+            dl
+            -static-libgcc
+            fmt::fmt-header-only
+            spdlog::spdlog_header_only
+            -static-libstdc++
+            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
   )
   set_target_properties(
     ANN_BENCH
@@ -355,17 +383,10 @@ if(RAFT_ANN_BENCH_SINGLE_EXE)
             BUILD_RPATH "\$ORIGIN"
             INSTALL_RPATH "\$ORIGIN"
   )
-
-  # Disable NVTX when the nvtx3 headers are missing
-  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
-  get_target_property(CMAKE_REQUIRED_INCLUDES ANN_BENCH INCLUDE_DIRECTORIES)
-  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
-  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
   target_compile_definitions(
     ANN_BENCH
     PRIVATE
-      $<$<BOOL:${CUDA_STATIC_RUNTIME}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}
-      ">
+      $<$<BOOL:${CUDA_STATIC_RUNTIME}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
       $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
   )
 
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index e964a81efa..9b77c9df91 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -18,6 +18,7 @@
 
 #include "cuda_stub.hpp"  // cudaStream_t
 
+#include <memory>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -64,17 +65,10 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
   }
 }
 
-class AlgoProperty {
- public:
-  inline AlgoProperty() {}
-  inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_)
-    : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_)
-  {
-  }
+struct AlgoProperty {
   MemoryType dataset_memory_type;
   // neighbors/distances should have same memory type as queries
   MemoryType query_memory_type;
-  virtual ~AlgoProperty() = default;
 };
 
 class AnnBase {
@@ -125,6 +119,11 @@ class ANN : public AnnBase {
   // The client code should call set_search_dataset() before searching,
   // and should not release dataset before searching is finished.
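The pure-virtual `copy()` added to the `ANN` interface just below is the core of the fix: each search thread clones its wrapper instead of sharing one instance. A minimal sketch of the intended ownership split (`example_wrapper` is a made-up class, not one of the patched wrappers): the large read-only index is shared through a `shared_ptr`, while every copy gets its own CUDA event.

```cpp
#include <cuda_runtime.h>

#include <memory>
#include <vector>

class example_wrapper {
 public:
  example_wrapper() : index_{std::make_shared<std::vector<float>>()}
  {
    cudaEventCreateWithFlags(&sync_, cudaEventDisableTiming);
  }
  // The copy shares the index but creates a fresh, thread-private event.
  example_wrapper(const example_wrapper& other) : index_{other.index_}
  {
    cudaEventCreateWithFlags(&sync_, cudaEventDisableTiming);
  }
  example_wrapper& operator=(const example_wrapper&) = delete;
  ~example_wrapper() { cudaEventDestroy(sync_); }

  // Mirrors the new copy() contract: a cheap clone that is safe to use
  // from a different CPU thread than the original.
  auto copy() const -> std::unique_ptr<example_wrapper>
  {
    return std::make_unique<example_wrapper>(*this);
  }

 private:
  std::shared_ptr<std::vector<float>> index_;  // shared: not mutated during search
  cudaEvent_t sync_{nullptr};                  // duplicated: carries per-thread state
};
```

The benchmark loop then calls `copy()` once per thread, as the `benchmark.hpp` change later in this patch shows.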
virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){}; + + /** + * Make a shallow copy of the ANN wrapper that shares the resources and ensures thread-safe access + * to them. */ + virtual auto copy() -> std::unique_ptr> = 0; }; } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index a2e77323c1..e61de6745e 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -45,7 +45,7 @@ std::condition_variable cond_var; std::atomic_int processed_threads{0}; static inline std::unique_ptr current_algo{nullptr}; -static inline std::shared_ptr current_algo_props{nullptr}; +static inline std::unique_ptr current_algo_props{nullptr}; using kv_series = std::vector>>; @@ -241,9 +241,8 @@ void bench_search(::benchmark::State& state, return; } - auto algo_property = parse_algo_property(algo->get_preference(), sp_json); - current_algo_props = std::make_shared(algo_property.dataset_memory_type, - algo_property.query_memory_type); + current_algo_props = std::make_unique( + std::move(parse_algo_property(algo->get_preference(), sp_json))); if (search_param->needs_dataset()) { try { @@ -277,23 +276,22 @@ void bench_search(::benchmark::State& state, // We are accessing shared variables (like current_algo, current_algo_probs) before the // benchmark loop, therefore the synchronization here is necessary. } - const auto algo_property = *current_algo_props; - query_set = dataset->query_set(algo_property.query_memory_type); + query_set = dataset->query_set(current_algo_props->query_memory_type); /** * Each thread will manage its own outputs */ std::shared_ptr> distances = - std::make_shared>(algo_property.query_memory_type, k * query_set_size); + std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); std::shared_ptr> neighbors = - std::make_shared>(algo_property.query_memory_type, k * query_set_size); + std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); cuda_timer gpu_timer; auto start = std::chrono::high_resolution_clock::now(); { nvtx_case nvtx{state.name()}; - ANN* algo = dynamic_cast*>(current_algo.get()); + auto algo = dynamic_cast*>(current_algo.get())->copy(); for (auto _ : state) { [[maybe_unused]] auto ntx_lap = nvtx.lap(); [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 755fe9f197..3cc4e10b49 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -73,8 +73,6 @@ class FaissCpu : public ANN { static_assert(std::is_same_v, "faiss support only float type"); } - virtual ~FaissCpu() noexcept {} - void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; void set_search_param(const AnnSearchParam& param) override; @@ -82,9 +80,9 @@ class FaissCpu : public ANN { void init_quantizer(int dim) { if (this->metric_type_ == faiss::MetricType::METRIC_L2) { - this->quantizer_ = std::make_unique(dim); + this->quantizer_ = std::make_shared(dim); } else if (this->metric_type_ == faiss::MetricType::METRIC_INNER_PRODUCT) { - this->quantizer_ = std::make_unique(dim); + this->quantizer_ = std::make_shared(dim); } } @@ -113,15 +111,15 @@ class FaissCpu : public ANN { template void load_(const std::string& file); - std::unique_ptr index_; - std::unique_ptr quantizer_; - std::unique_ptr index_refine_; + std::shared_ptr index_; + std::shared_ptr quantizer_; + std::shared_ptr 
index_refine_; faiss::MetricType metric_type_; int nlist_; double training_sample_fraction_; int num_threads_; - std::unique_ptr thread_pool_; + std::shared_ptr thread_pool_; }; template @@ -152,7 +150,7 @@ void FaissCpu::build(const T* dataset, size_t nrow, cudaStream_t stream) index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing assert(index_->is_trained); index_->add(nrow, dataset); - index_refine_ = std::make_unique(this->index_.get(), dataset); + index_refine_ = std::make_shared(this->index_.get(), dataset); } template @@ -169,7 +167,7 @@ void FaissCpu::set_search_param(const AnnSearchParam& param) if (!thread_pool_ || num_threads_ != search_param.num_threads) { num_threads_ = search_param.num_threads; - thread_pool_ = std::make_unique(num_threads_); + thread_pool_ = std::make_shared(num_threads_); } } @@ -203,7 +201,7 @@ template template void FaissCpu::load_(const std::string& file) { - index_ = std::unique_ptr(dynamic_cast(faiss::read_index(file.c_str()))); + index_ = std::shared_ptr(dynamic_cast(faiss::read_index(file.c_str()))); } template @@ -214,7 +212,7 @@ class FaissCpuIVFFlat : public FaissCpu { FaissCpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) { this->init_quantizer(dim); - this->index_ = std::make_unique( + this->index_ = std::make_shared( this->quantizer_.get(), dim, param.nlist, this->metric_type_); } @@ -223,6 +221,11 @@ class FaissCpuIVFFlat : public FaissCpu { this->template save_(file); } void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } }; template @@ -237,7 +240,7 @@ class FaissCpuIVFPQ : public FaissCpu { FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) { this->init_quantizer(dim); - this->index_ = std::make_unique( + this->index_ = std::make_shared( this->quantizer_.get(), dim, param.nlist, param.M, param.bitsPerCode, this->metric_type_); } @@ -246,6 +249,11 @@ class FaissCpuIVFPQ : public FaissCpu { this->template save_(file); } void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } }; // TODO: Enable this in cmake @@ -270,7 +278,7 @@ class FaissCpuIVFSQ : public FaissCpu { } this->init_quantizer(dim); - this->index_ = std::make_unique( + this->index_ = std::make_shared( this->quantizer_.get(), dim, param.nlist, qtype, this->metric_type_, true); } @@ -282,6 +290,11 @@ class FaissCpuIVFSQ : public FaissCpu { { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } }; template @@ -290,7 +303,7 @@ class FaissCpuFlat : public FaissCpu { FaissCpuFlat(Metric metric, int dim) : FaissCpu(metric, dim, typename FaissCpu::BuildParam{}) { - this->index_ = std::make_unique(dim, this->metric_type_); + this->index_ = std::make_shared(dim, this->metric_type_); } // class FaissCpu is more like a IVF class, so need special treating here @@ -299,7 +312,7 @@ class FaissCpuFlat : public FaissCpu { auto search_param = dynamic_cast::SearchParam&>(param); if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) { this->num_threads_ = search_param.num_threads; - this->thread_pool_ = std::make_unique(this->num_threads_); + this->thread_pool_ = std::make_shared(this->num_threads_); } }; @@ -308,6 +321,11 @@ class FaissCpuFlat : public FaissCpu { 
this->template save_(file); } void load(const std::string& file) override { this->template load_(file); } + + std::unique_ptr> copy() + { + return std::make_unique>(*this); // use copy constructor + } }; } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 4f13ff8a49..ad51dd4e68 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -105,7 +105,6 @@ class FaissGpu : public ANN { RAFT_CUDA_TRY(cudaGetDevice(&device_)); RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); - raft::resource::set_cuda_stream(handle_, faiss_default_stream_); } virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } @@ -147,18 +146,33 @@ class FaissGpu : public ANN { RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); } + /** [NOTE Multithreading] + * + * `gpu_resource_` is a shared resource: + * 1. It uses a shared_ptr under the hood, so the copies of it refer to the same + * resource implementation instance + * 2. GpuIndex is probably keeping a reference to it, as it's passed to the constructor + * + * To avoid copying the index (database) in each thread, we make both the index and + * the gpu_resource shared. + * This means faiss GPU streams are possibly shared among the CPU threads; + * the throughput search mode may be inaccurate. + * + * WARNING: we haven't investigated whether faiss::gpu::GpuIndex or + * faiss::gpu::StandardGpuResources are thread-safe. + * + */ mutable faiss::gpu::StandardGpuResources gpu_resource_; - std::unique_ptr index_; - std::unique_ptr index_refine_{nullptr}; + std::shared_ptr index_; + std::shared_ptr index_refine_{nullptr}; faiss::MetricType metric_type_; int nlist_; int device_; cudaEvent_t sync_{nullptr}; cudaStream_t faiss_default_stream_{nullptr}; double training_sample_fraction_; - std::unique_ptr search_params_; + std::shared_ptr search_params_; const T* dataset_; - raft::device_resources handle_; float refine_ratio_ = 1.0; }; @@ -263,7 +277,7 @@ class FaissGpuIVFFlat : public FaissGpu { { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = this->device_; - this->index_ = std::make_unique( + this->index_ = std::make_shared( &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); } @@ -275,7 +289,7 @@ class FaissGpuIVFFlat : public FaissGpu { faiss::IVFSearchParameters faiss_search_params; faiss_search_params.nprobe = nprobe; - this->search_params_ = std::make_unique(faiss_search_params); + this->search_params_ = std::make_shared(faiss_search_params); this->refine_ratio_ = search_param.refine_ratio; } @@ -287,6 +301,7 @@ class FaissGpuIVFFlat : public FaissGpu { { this->template load_(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; }; template @@ -306,7 +321,7 @@ class FaissGpuIVFPQ : public FaissGpu { config.device = this->device_; this->index_ = - std::make_unique(&(this->gpu_resource_), + std::make_shared(&(this->gpu_resource_), dim, param.nlist, param.M, @@ -324,11 +339,11 @@ class FaissGpuIVFPQ : public FaissGpu { faiss::IVFPQSearchParameters faiss_search_params; faiss_search_params.nprobe = nprobe; - this->search_params_ = std::make_unique(faiss_search_params); + this->search_params_ = std::make_shared(faiss_search_params); if (search_param.refine_ratio > 1.0) { this->index_refine_ = - std::make_unique(this->index_.get(), this->dataset_); + 
std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; } } @@ -341,6 +356,7 @@ class FaissGpuIVFPQ : public FaissGpu { { this->template load_(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; }; // TODO: Enable this in cmake @@ -366,7 +382,7 @@ class FaissGpuIVFSQ : public FaissGpu { faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; config.device = this->device_; - this->index_ = std::make_unique( + this->index_ = std::make_shared( &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); } @@ -379,11 +395,11 @@ class FaissGpuIVFSQ : public FaissGpu { faiss::IVFSearchParameters faiss_search_params; faiss_search_params.nprobe = nprobe; - this->search_params_ = std::make_unique(faiss_search_params); + this->search_params_ = std::make_shared(faiss_search_params); this->refine_ratio_ = search_param.refine_ratio; if (search_param.refine_ratio > 1.0) { this->index_refine_ = - std::make_unique(this->index_.get(), this->dataset_); + std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; } } @@ -398,6 +414,7 @@ class FaissGpuIVFSQ : public FaissGpu { this->template load_( file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; }; template @@ -408,7 +425,7 @@ class FaissGpuFlat : public FaissGpu { { faiss::gpu::GpuIndexFlatConfig config; config.device = this->device_; - this->index_ = std::make_unique( + this->index_ = std::make_shared( &(this->gpu_resource_), dim, this->metric_type_, config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override @@ -417,7 +434,7 @@ class FaissGpuFlat : public FaissGpu { int nprobe = search_param.nprobe; assert(nprobe <= nlist_); - this->search_params_ = std::make_unique(); + this->search_params_ = std::make_shared(); } void save(const std::string& file) const override @@ -428,8 +445,9 @@ class FaissGpuFlat : public FaissGpu { { this->template load_(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; }; } // namespace raft::bench::ann -#endif \ No newline at end of file +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index 664ec511dd..20c50a5119 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -52,7 +52,6 @@ class Ggnn : public ANN { }; Ggnn(Metric metric, int dim, const BuildParam& param); - ~Ggnn() { delete impl_; } void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override { @@ -72,6 +71,7 @@ class Ggnn : public ANN { void save(const std::string& file) const override { impl_->save(file); } void load(const std::string& file) override { impl_->load(file); } + std::unique_ptr> copy() override { return std::make_unique>(*this); }; AlgoProperty get_preference() const override { return impl_->get_preference(); } @@ -81,7 +81,7 @@ class Ggnn : public ANN { }; private: - ANN* impl_; + std::shared_ptr> impl_; }; template @@ -90,23 +90,23 @@ Ggnn::Ggnn(Metric metric, int dim, const BuildParam& param) : ANN(metric, // ggnn/src/sift1m.cu if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && param.segment_size == 32) { - impl_ = new GgnnImpl(metric, dim, param); + impl_ = std::make_shared>(metric, dim, param); } // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B else if (metric == Metric::kEuclidean && dim == 96 && param.k_build == 24 && param.k == 
10 && param.segment_size == 32) { - impl_ = new GgnnImpl(metric, dim, param); + impl_ = std::make_shared>(metric, dim, param); } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && param.segment_size == 32) { - impl_ = new GgnnImpl(metric, dim, param); + impl_ = std::make_shared>(metric, dim, param); } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && param.segment_size == 64) { - impl_ = new GgnnImpl(metric, dim, param); + impl_ = std::make_shared>(metric, dim, param); } // ggnn/src/glove200.cu, adapt it to glove100 else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && param.segment_size == 64) { - impl_ = new GgnnImpl(metric, dim, param); + impl_ = std::make_shared>(metric, dim, param); } else { throw std::runtime_error( "ggnn: not supported combination of metric, dim and build param; " @@ -133,6 +133,10 @@ class GgnnImpl : public ANN { void save(const std::string& file) const override; void load(const std::string& file) override; + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + }; AlgoProperty get_preference() const override { @@ -159,7 +163,7 @@ class GgnnImpl : public ANN { KBuild / 2 /* KF */, KQuery, S>; - std::unique_ptr ggnn_; + std::shared_ptr ggnn_; typename Ggnn::BuildParam build_param_; typename Ggnn::SearchParam search_param_; }; @@ -189,7 +193,7 @@ void GgnnImpl::build(const T* dataset, { int device; RAFT_CUDA_TRY(cudaGetDevice(&device)); - ggnn_ = std::make_unique( + ggnn_ = std::make_shared( device, nrow, build_param_.num_layers, true, build_param_.tau); ggnn_->set_base_data(dataset); diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 921d72decc..2a5177d295 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -82,6 +82,7 @@ class HnswLib : public ANN { void save(const std::string& path_to_index) const override; void load(const std::string& path_to_index) override; + std::unique_ptr> copy() override { return std::make_unique>(*this); }; AlgoProperty get_preference() const override { @@ -96,15 +97,15 @@ class HnswLib : public ANN { private: void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; - std::unique_ptr::type>> appr_alg_; - std::unique_ptr::type>> space_; + std::shared_ptr::type>> appr_alg_; + std::shared_ptr::type>> space_; using ANN::metric_; using ANN::dim_; int ef_construction_; int m_; int num_threads_; - std::unique_ptr thread_pool_; + std::shared_ptr thread_pool_; Objective metric_objective_; }; @@ -129,18 +130,18 @@ void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) { if constexpr (std::is_same_v) { if (metric_ == Metric::kInnerProduct) { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } else { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } } else if constexpr (std::is_same_v) { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } - appr_alg_ = std::make_unique::type>>( + appr_alg_ = std::make_shared::type>>( space_.get(), nrow, m_, ef_construction_); - thread_pool_ = std::make_unique(num_threads_); + thread_pool_ = std::make_shared(num_threads_); const size_t items_per_thread = nrow / (num_threads_ + 1); thread_pool_->submit( @@ -168,7 +169,7 @@ void HnswLib::set_search_param(const AnnSearchParam& param_) // Create a pool if multiple query threads have been set and 
the pool hasn't been created already bool create_pool = (metric_objective_ == Objective::LATENCY && num_threads_ > 1 && !thread_pool_); - if (create_pool) { thread_pool_ = std::make_unique(num_threads_); } + if (create_pool) { thread_pool_ = std::make_shared(num_threads_); } } template @@ -199,15 +200,15 @@ void HnswLib::load(const std::string& path_to_index) { if constexpr (std::is_same_v) { if (metric_ == Metric::kInnerProduct) { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } else { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } } else if constexpr (std::is_same_v) { - space_ = std::make_unique(dim_); + space_ = std::make_shared(dim_); } - appr_alg_ = std::make_unique::type>>( + appr_alg_ = std::make_shared::type>>( space_.get(), path_to_index); } diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index cb30c2693f..2b91c2588c 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -41,4 +41,103 @@ inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric m throw std::runtime_error("raft supports only metric type of inner product and L2"); } } + +/** + * This struct is used by multiple raft benchmark wrappers. It serves as a thread-safe keeper of + * shared and private GPU resources (see below). + * + * - Accessing the same `configured_raft_resources` from concurrent threads is not safe. + * - Accessing the copies of `configured_raft_resources` from concurrent threads is safe. + * - There must be at most one "original" `configured_raft_resources` at any time, but as many + * copies of it as needed (modifies the program static state). + */ +class configured_raft_resources { + public: + using device_mr_t = rmm::mr::pool_memory_resource; + /** + * This constructor has the shared state passed unmodified but creates the local state anew. + * It's used by the copy constructor. + */ + explicit configured_raft_resources(const std::shared_ptr& mr) + : mr_{mr}, + sync_{[]() { + auto* ev = new cudaEvent_t; + RAFT_CUDA_TRY(cudaEventCreate(ev, cudaEventDisableTiming)); + return ev; + }(), + [](cudaEvent_t* ev) { + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(*ev)); + delete ev; + }}, + res_{cudaStreamPerThread} + { + } + + /** Default constructor creates all resources anew. */ + configured_raft_resources() + : configured_raft_resources{ + {[]() { + auto* mr = + new device_mr_t{rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull}; + rmm::mr::set_current_device_resource(mr); + return mr; + }(), + [](device_mr_t* mr) { + if (mr == nullptr) { return; } + auto* cur_mr = dynamic_cast(rmm::mr::get_current_device_resource()); + if (cur_mr != nullptr && (*cur_mr) == (*mr)) { + // Normally, we'd always want to set the rmm resource back to the upstream of the pool + // here. However, we expect some implementations may be buggy and mess up the rmm + // resource, especially during development. This extra check here adds a little bit of + // resilience: let the program crash/fail somewhere else rather than in the destructor + // of the shared pointer. 
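The added line that follows this comment restores the pool's upstream resource before the pool itself is deleted. Factored out of the deleter lambda into a standalone RAII type, the same pattern looks roughly like this (a sketch against public rmm APIs only; `scoped_pool` is a made-up name):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <cstddef>
#include <memory>

// Installs a pool over the current device resource on construction and puts
// the upstream resource back when it goes out of scope.
struct scoped_pool {
  using pool_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;

  explicit scoped_pool(std::size_t initial_size)
    : pool_{std::make_unique<pool_t>(rmm::mr::get_current_device_resource(), initial_size)}
  {
    rmm::mr::set_current_device_resource(pool_.get());
  }
  ~scoped_pool() { rmm::mr::set_current_device_resource(pool_->get_upstream()); }

  std::unique_ptr<pool_t> pool_;
};
```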
+ rmm::mr::set_current_device_resource(mr->get_upstream()); + } + delete mr; + }}} + { + } + + configured_raft_resources(configured_raft_resources&&) = default; + configured_raft_resources& operator=(configured_raft_resources&&) = default; + ~configured_raft_resources() = default; + configured_raft_resources(const configured_raft_resources& res) + : configured_raft_resources{res.mr_} + { + } + configured_raft_resources& operator=(const configured_raft_resources& other) + { + this->mr_ = other.mr_; + return *this; + } + + operator raft::resources&() noexcept { return res_; } + operator const raft::resources&() const noexcept { return res_; } + + /** Make the given stream wait on all work submitted to the resource. */ + void stream_wait(cudaStream_t stream) const + { + RAFT_CUDA_TRY(cudaEventRecord(*sync_, resource::get_cuda_stream(res_))); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, *sync_)); + } + + /** Get the internal sync event (which otherwise used only in `stream_wait`). */ + cudaEvent_t get_sync_event() const { return *sync_; } + + private: + /** + * This pool is set as the RMM current device, hence its shared among all users of RMM resources. + * Its lifetime must be longer than that of any other cuda resources. It's not exposed and not + * used by anyone directly. + */ + std::shared_ptr mr_; + /** Each benchmark wrapper must have its own copy of the synchronization event. */ + std::unique_ptr> sync_; + /** + * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must + * have its own copy of it. + */ + raft::device_resources res_; +}; + } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index f8c65a2d6e..b776a9fafb 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -126,15 +126,5 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) -{ - rmm::mr::cuda_memory_resource cuda_mr; - // Construct a resource that uses a coalescing best-fit pool allocator - rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; - rmm::mr::set_current_device_resource( - &pool_mr); // Updates the current device resource pointer to `pool_mr` - rmm::mr::device_memory_resource* mr = - rmm::mr::get_current_device_resource(); // Points to `pool_mr` - return raft::bench::ann::run_main(argc, argv); -} +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } #endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h index 432caecfcc..3fd0a374b7 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -30,15 +30,12 @@ class RaftCagraHnswlib : public ANN { RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) : ANN(metric, dim), - metric_(metric), - index_params_(param), - dimension_(dim), - handle_(cudaStreamPerThread) + cagra_build_{metric, dim, param, concurrent_searches}, + // HnswLib param values don't matter since we don't build with HnswLib + hnswlib_search_{metric, dim, typename HnswLib::BuildParam{50, 100}} { } - ~RaftCagraHnswlib() noexcept {} - void build(const T* dataset, size_t nrow, cudaStream_t stream) final; void set_search_param(const AnnSearchParam& param) override; @@ -60,61 +57,53 @@ class RaftCagraHnswlib : public ANN { property.query_memory_type = 
MemoryType::Host; return property; } + void save(const std::string& file) const override; void load(const std::string&) override; + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + } private: - raft::device_resources handle_; - Metric metric_; - BuildParam index_params_; - int dimension_; - - std::unique_ptr> cagra_build_; - std::unique_ptr> hnswlib_search_; - - Objective metric_objective_; + RaftCagra cagra_build_; + HnswLib hnswlib_search_; }; template void RaftCagraHnswlib::build(const T* dataset, size_t nrow, cudaStream_t stream) { - if (not cagra_build_) { - cagra_build_ = std::make_unique>(metric_, dimension_, index_params_); - } - cagra_build_->build(dataset, nrow, stream); + cagra_build_.build(dataset, nrow, stream); } template void RaftCagraHnswlib::set_search_param(const AnnSearchParam& param_) { - hnswlib_search_->set_search_param(param_); + hnswlib_search_.set_search_param(param_); } template void RaftCagraHnswlib::save(const std::string& file) const { - cagra_build_->save_to_hnswlib(file); + cagra_build_.save_to_hnswlib(file); } template void RaftCagraHnswlib::load(const std::string& file) { - typename HnswLib::BuildParam param; - // these values don't matter since we don't build with HnswLib - param.M = 50; - param.ef_construction = 100; - if (not hnswlib_search_) { - hnswlib_search_ = std::make_unique>(metric_, dimension_, param); - } - hnswlib_search_->load(file); - hnswlib_search_->set_base_layer_only(); + hnswlib_search_.load(file); + hnswlib_search_.set_base_layer_only(); } template -void RaftCagraHnswlib::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +void RaftCagraHnswlib::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const { - hnswlib_search_->search(queries, batch_size, k, neighbors, distances); + hnswlib_search_.search(queries, batch_size, k, neighbors, distances, stream); } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index a3e481ec5a..ec71de9cff 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -76,21 +76,20 @@ class RaftCagra : public ANN { : ANN(metric, dim), index_params_(param), dimension_(dim), - handle_(cudaStreamPerThread), need_dataset_update_(true), - dataset_(make_device_matrix(handle_, 0, 0)), - graph_(make_device_matrix(handle_, 0, 0)), - input_dataset_v_(nullptr, 0, 0), + dataset_(std::make_shared>( + std::move(make_device_matrix(handle_, 0, 0)))), + graph_(std::make_shared>( + std::move(make_device_matrix(handle_, 0, 0)))), + input_dataset_v_( + std::make_shared>(nullptr, 0, 0)), graph_mem_(AllocatorType::Device), dataset_mem_(AllocatorType::Device) { index_params_.cagra_params.metric = parse_metric_type(metric); index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); - RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftCagra() noexcept {} - void build(const T* dataset, size_t nrow, cudaStream_t stream) final; void set_search_param(const AnnSearchParam& param) override; @@ -117,8 +116,24 @@ class RaftCagra : public ANN { void save(const std::string& file) const override; void load(const std::string&) override; void save_to_hnswlib(const std::string& file) const; + std::unique_ptr> copy() override; private: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + 
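The "handle_ must go first" comment below leans on a C++ guarantee worth spelling out: non-static data members are constructed in declaration order and destroyed in reverse declaration order, so the resource declared first outlives everything declared after it. A tiny self-contained illustration (all names made up):

```cpp
#include <cstdio>

struct resources { ~resources() { std::puts("resources die last"); } };
struct index_t   { ~index_t()   { std::puts("index dies first"); } };

struct wrapper {
  resources handle;  // declared first: destroyed last, so memory obtained
                     // through it stays valid while the index is torn down
  index_t index;     // declared after: destroyed first
};

int main() { wrapper w; }  // prints "index dies first", then "resources die last"
```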
raft::mr::cuda_pinned_resource mr_pinned_; + raft::mr::cuda_huge_page_resource mr_huge_page_; + AllocatorType graph_mem_; + AllocatorType dataset_mem_; + BuildParam index_params_; + bool need_dataset_update_; + raft::neighbors::cagra::search_params search_params_; + std::shared_ptr> index_; + int dimension_; + std::shared_ptr> graph_; + std::shared_ptr> dataset_; + std::shared_ptr> input_dataset_v_; + inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) { switch (mem_type) { @@ -127,38 +142,26 @@ class RaftCagra : public ANN { default: return rmm::mr::get_current_device_resource(); } } - raft ::mr::cuda_pinned_resource mr_pinned_; - raft ::mr::cuda_huge_page_resource mr_huge_page_; - raft::device_resources handle_; - AllocatorType graph_mem_; - AllocatorType dataset_mem_; - BuildParam index_params_; - bool need_dataset_update_; - raft::neighbors::cagra::search_params search_params_; - std::optional> index_; - int device_; - int dimension_; - raft::device_matrix graph_; - raft::device_matrix dataset_; - raft::device_matrix_view input_dataset_v_; }; template -void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t) +void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t stream) { auto dataset_view = raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); auto& params = index_params_.cagra_params; - index_.emplace(raft::neighbors::cagra::detail::build(handle_, - params, - dataset_view, - index_params_.nn_descent_params, - index_params_.ivf_pq_refine_rate, - index_params_.ivf_pq_build_params, - index_params_.ivf_pq_search_params)); - return; + index_ = std::make_shared>( + std::move(raft::neighbors::cagra::detail::build(handle_, + params, + dataset_view, + index_params_.nn_descent_params, + index_params_.ivf_pq_refine_rate, + index_params_.ivf_pq_build_params, + index_params_.ivf_pq_search_params))); + + handle_.stream_wait(stream); // RAFT stream -> bench stream } inline std::string allocator_to_string(AllocatorType mem_type) @@ -194,24 +197,24 @@ void RaftCagra::set_search_param(const AnnSearchParam& param) index_->update_graph(handle_, make_const_mdspan(new_graph.view())); // update_graph() only stores a view in the index. We need to keep the graph object alive. - graph_ = std::move(new_graph); + *graph_ = std::move(new_graph); } if (search_param.dataset_mem != dataset_mem_ || need_dataset_update_) { dataset_mem_ = search_param.dataset_mem; // First free up existing memory - dataset_ = make_device_matrix(handle_, 0, 0); - index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + *dataset_ = make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_->view())); // Allocate space using the correct memory resource. RAFT_LOG_INFO("moving dataset to new memory space: %s", allocator_to_string(dataset_mem_).c_str()); auto mr = get_mr(dataset_mem_); - raft::neighbors::cagra::detail::copy_with_padding(handle_, dataset_, input_dataset_v_, mr); + raft::neighbors::cagra::detail::copy_with_padding(handle_, *dataset_, *input_dataset_v_, mr); - index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + index_->update_dataset(handle_, make_const_mdspan(dataset_->view())); // Ideally, instead of dataset_.view(), we should pass a strided matrix view to update. // See Issue https://github.com/rapidsai/raft/issues/1972 for details. 
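Throughout these wrappers, `handle_.stream_wait(stream)` hands completed work from the RAFT stream over to the benchmark's stream ("RAFT stream -> bench stream"). Stripped of the wrapper, the idiom is a plain event handoff between two CUDA streams; a sketch with an illustrative free function:

```cpp
#include <cuda_runtime.h>

// Order two streams without blocking the host: everything enqueued on
// `producer` so far must finish before work enqueued on `consumer` after
// this call may start.
inline void wait_on_stream(cudaEvent_t sync, cudaStream_t producer, cudaStream_t consumer)
{
  cudaEventRecord(sync, producer);         // capture the producer's current progress
  cudaStreamWaitEvent(consumer, sync, 0);  // consumer waits on the event; host continues
}
```

Unlike `cudaStreamSynchronize`, this keeps the measuring host thread free, which is what the throughput mode needs.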
@@ -227,9 +230,9 @@ void RaftCagra::set_search_dataset(const T* dataset, size_t nrow) { // It can happen that we are re-using a previous algo object which already has // the dataset set. Check if we need update. - if (static_cast(input_dataset_v_.extent(0)) != nrow || - input_dataset_v_.data_handle() != dataset) { - input_dataset_v_ = make_device_matrix_view(dataset, nrow, this->dim_); + if (static_cast(input_dataset_v_->extent(0)) != nrow || + input_dataset_v_->data_handle() != dataset) { + *input_dataset_v_ = make_device_matrix_view(dataset, nrow, this->dim_); need_dataset_update_ = true; } } @@ -249,12 +252,23 @@ void RaftCagra::save_to_hnswlib(const std::string& file) const template void RaftCagra::load(const std::string& file) { - index_ = raft::neighbors::cagra::deserialize(handle_, file); + index_ = std::make_shared>( + std::move(raft::neighbors::cagra::deserialize(handle_, file))); +} + +template +std::unique_ptr> RaftCagra::copy() +{ + return std::make_unique>(*this); // use copy constructor } template -void RaftCagra::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +void RaftCagra::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const { IdxT* neighbors_IdxT; rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); @@ -281,6 +295,6 @@ void RaftCagra::search( raft::resource::get_cuda_stream(handle_)); } - handle_.sync_stream(); + handle_.stream_wait(stream); // RAFT stream -> bench stream } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 13ea20d483..51b8b67f37 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -59,8 +59,6 @@ class RaftIvfFlatGpu : public ANN { RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftIvfFlatGpu() noexcept {} - void build(const T* dataset, size_t nrow, cudaStream_t stream) final; void set_search_param(const AnnSearchParam& param) override; @@ -84,22 +82,24 @@ class RaftIvfFlatGpu : public ANN { } void save(const std::string& file) const override; void load(const std::string&) override; + std::unique_ptr> copy() override; private: - raft::device_resources handle_; + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; BuildParam index_params_; raft::neighbors::ivf_flat::search_params search_params_; - std::optional> index_; + std::shared_ptr> index_; int device_; int dimension_; }; template -void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t) +void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) { - index_.emplace( - raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_)); - return; + index_ = std::make_shared>(std::move( + raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_))); + handle_.stream_wait(stream); // RAFT stream -> bench stream } template @@ -120,18 +120,28 @@ void RaftIvfFlatGpu::save(const std::string& file) const template void RaftIvfFlatGpu::load(const std::string& file) { - index_ = raft::neighbors::ivf_flat::deserialize(handle_, file); + index_ = std::make_shared>( + std::move(raft::neighbors::ivf_flat::deserialize(handle_, file))); return; } template -void RaftIvfFlatGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* 
distances, cudaStream_t) const +std::unique_ptr> RaftIvfFlatGpu::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void RaftIvfFlatGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const { static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); raft::neighbors::ivf_flat::search( handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); - resource::sync_stream(handle_); - return; + handle_.stream_wait(stream); // RAFT stream -> bench stream } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index e4004b0007..9a373787ac 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -57,12 +57,8 @@ class RaftIvfPQ : public ANN { : ANN(metric, dim), index_params_(param), dimension_(dim) { index_params_.metric = parse_metric_type(metric); - RAFT_CUDA_TRY(cudaGetDevice(&device_)); - RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); } - ~RaftIvfPQ() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } - void build(const T* dataset, size_t nrow, cudaStream_t stream) final; void set_search_param(const AnnSearchParam& param) override; @@ -87,23 +83,17 @@ class RaftIvfPQ : public ANN { } void save(const std::string& file) const override; void load(const std::string&) override; + std::unique_ptr> copy() override; private: - raft::device_resources handle_; - cudaEvent_t sync_{nullptr}; + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; BuildParam index_params_; raft::neighbors::ivf_pq::search_params search_params_; - std::optional> index_; - int device_; + std::shared_ptr> index_; int dimension_; float refine_ratio_ = 1.0; raft::device_matrix_view dataset_; - - void stream_wait(cudaStream_t stream) const - { - RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); - } }; template @@ -115,9 +105,9 @@ void RaftIvfPQ::save(const std::string& file) const template void RaftIvfPQ::load(const std::string& file) { - auto index_tmp = raft::neighbors::ivf_pq::index(handle_, index_params_, dimension_); - raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, &index_tmp); - index_.emplace(std::move(index_tmp)); + std::make_shared>(handle_, index_params_, dimension_) + .swap(index_); + raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, index_.get()); return; } @@ -125,9 +115,16 @@ template void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t stream) { auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); + std::make_shared>( + std::move(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v))) + .swap(index_); + handle_.stream_wait(stream); // RAFT stream -> bench stream +} - index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); - stream_wait(stream); +template +std::unique_ptr> RaftIvfPQ::copy() +{ + return std::make_unique>(*this); // use copy constructor } template @@ -176,7 +173,7 @@ void RaftIvfPQ::search(const T* queries, neighbors_v, distances_v, index_->metric()); - stream_wait(stream); // RAFT stream -> bench stream + handle_.stream_wait(stream); // RAFT stream -> bench stream } else { auto queries_host = raft::make_host_matrix(batch_size, 
index_->dim()); auto candidates_host = raft::make_host_matrix(batch_size, k0); @@ -193,9 +190,9 @@ void RaftIvfPQ::search(const T* queries, dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1)); // wait for the queries to copy to host in 'stream` and for IVF-PQ::search to finish - RAFT_CUDA_TRY(cudaEventRecord(sync_, resource::get_cuda_stream(handle_))); - RAFT_CUDA_TRY(cudaEventRecord(sync_, stream)); - RAFT_CUDA_TRY(cudaEventSynchronize(sync_)); + RAFT_CUDA_TRY(cudaEventRecord(handle_.get_sync_event(), resource::get_cuda_stream(handle_))); + RAFT_CUDA_TRY(cudaEventRecord(handle_.get_sync_event(), stream)); + RAFT_CUDA_TRY(cudaEventSynchronize(handle_.get_sync_event())); raft::runtime::neighbors::refine(handle_, dataset_v, queries_host.view(), @@ -215,7 +212,7 @@ void RaftIvfPQ::search(const T* queries, raft::runtime::neighbors::ivf_pq::search( handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); - stream_wait(stream); // RAFT stream -> bench stream + handle_.stream_wait(stream); // RAFT stream -> bench stream } } } // namespace raft::bench::ann From 6c95f9c0cc1b160f4934dce2dd7a875f592edb8c Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Wed, 13 Dec 2023 21:39:10 +0100 Subject: [PATCH 02/11] Fix a crash in FAISS benchmark wrapper introduced in #2021 (#2062) With the changes introduced by #2021, the copied FAISS benchmark wrapper contains a cuda event that is used for synchronizing between streams during search. The lifetime of the event is the same as of the wrapper, but the event handle itself is copied between the wrappers; this leads to illegal memory accesses and crashes. This PR fixes the bug by creating a new cuda event on each wrapper copy, so that the wrappers do not share their synchronization events. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2062 --- cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h | 33 +++++++++++++-------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index ad51dd4e68..7879530753 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -80,6 +80,19 @@ class OmpSingleThreadScope { namespace raft::bench::ann { +struct copyable_event { + copyable_event() { RAFT_CUDA_TRY(cudaEventCreate(&value_, cudaEventDisableTiming)); } + ~copyable_event() { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(value_)); } + copyable_event(copyable_event&&) = default; + copyable_event& operator=(copyable_event&&) = default; + copyable_event(const copyable_event& res) : copyable_event{} {} + copyable_event& operator=(const copyable_event& other) = delete; + operator cudaEvent_t() const noexcept { return value_; } + + private: + cudaEvent_t value_{nullptr}; +}; + template class FaissGpu : public ANN { public: @@ -97,18 +110,15 @@ class FaissGpu : public ANN { FaissGpu(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim), + gpu_resource_{std::make_shared()}, metric_type_(parse_metric_type(metric)), nlist_{param.nlist}, training_sample_fraction_{1.0 / double(param.ratio)} { static_assert(std::is_same_v, "faiss support only float type"); RAFT_CUDA_TRY(cudaGetDevice(&device_)); - RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); - faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); } - virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } - void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; virtual void set_search_param(const FaissGpu::AnnSearchParam& param) {} @@ -142,7 +152,7 @@ class FaissGpu : public ANN { void stream_wait(cudaStream_t stream) const { - RAFT_CUDA_TRY(cudaEventRecord(sync_, faiss_default_stream_)); + RAFT_CUDA_TRY(cudaEventRecord(sync_, gpu_resource_->getDefaultStream(device_))); RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_)); } @@ -162,14 +172,13 @@ class FaissGpu : public ANN { * faiss::gpu::StandardGpuResources are thread-safe. 
* */ - mutable faiss::gpu::StandardGpuResources gpu_resource_; + mutable std::shared_ptr gpu_resource_; std::shared_ptr index_; std::shared_ptr index_refine_{nullptr}; faiss::MetricType metric_type_; int nlist_; int device_; - cudaEvent_t sync_{nullptr}; - cudaStream_t faiss_default_stream_{nullptr}; + copyable_event sync_{}; double training_sample_fraction_; std::shared_ptr search_params_; const T* dataset_; @@ -278,7 +287,7 @@ class FaissGpuIVFFlat : public FaissGpu { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = this->device_; this->index_ = std::make_shared( - &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); + this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override @@ -321,7 +330,7 @@ class FaissGpuIVFPQ : public FaissGpu { config.device = this->device_; this->index_ = - std::make_shared(&(this->gpu_resource_), + std::make_shared(this->gpu_resource_.get(), dim, param.nlist, param.M, @@ -383,7 +392,7 @@ class FaissGpuIVFSQ : public FaissGpu { faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; config.device = this->device_; this->index_ = std::make_shared( - &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); + this->gpu_resource_.get(), dim, param.nlist, qtype, this->metric_type_, true, config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override @@ -426,7 +435,7 @@ class FaissGpuFlat : public FaissGpu { faiss::gpu::GpuIndexFlatConfig config; config.device = this->device_; this->index_ = std::make_shared( - &(this->gpu_resource_), dim, this->metric_type_, config); + this->gpu_resource_.get(), dim, this->metric_type_, config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override { From 80a48ca33c5d57d9b0597aacfe72fd0f87d40be6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Dec 2023 15:57:30 -0800 Subject: [PATCH 03/11] Switch to scikit-build-core (#2051) Contributes to rapidsai/build-planning#2 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2051 --- build.sh | 8 ++-- ci/build_wheel_pylibraft.sh | 2 +- ci/build_wheel_raft_dask.sh | 2 +- .../all_cuda-118_arch-aarch64.yaml | 2 +- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-aarch64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- .../bench_ann_cuda-118_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-118_arch-x86_64.yaml | 2 +- .../bench_ann_cuda-120_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/pylibraft/meta.yaml | 2 +- conda/recipes/raft-dask/meta.yaml | 2 +- dependencies.yaml | 7 ++-- python/pylibraft/CMakeLists.txt | 8 +--- python/pylibraft/README.md | 1 + python/pylibraft/pyproject.toml | 25 ++++++++----- python/pylibraft/setup.py | 37 ------------------- python/raft-dask/CMakeLists.txt | 8 +--- python/raft-dask/README.md | 1 + python/raft-dask/pyproject.toml | 24 +++++++----- python/raft-dask/setup.py | 36 ------------------ 22 files changed, 56 insertions(+), 123 deletions(-) create mode 120000 python/pylibraft/README.md delete mode 100644 python/pylibraft/setup.py create mode 120000 python/raft-dask/README.md delete mode 100644 python/raft-dask/setup.py diff --git a/build.sh b/build.sh index 200d6710e0..e5df0af826 100755 --- a/build.sh +++ b/build.sh @@ -386,6 +386,8 @@ SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi +# Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS +SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g') # If clean given, run it prior to any other steps if (( ${CLEAN} == 1 )); then @@ -493,15 +495,13 @@ fi # Build and (optionally) install the pylibraft Python package if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then - SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft fi # Build and (optionally) install the raft-dask Python package if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then - SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask fi diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh index 48e20f597e..ec30a28b92 100755 --- a/ci/build_wheel_pylibraft.sh +++ b/ci/build_wheel_pylibraft.sh @@ -4,6 +4,6 @@ set -euo pipefail # Set up skbuild options. Enable sccache in skbuild config options -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF" ci/build_wheel.sh pylibraft python/pylibraft diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index cad09602dc..5ae12303d0 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -4,6 +4,6 @@ set -euo pipefail # Set up skbuild options. 
Enable sccache in skbuild config options -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF" ci/build_wheel.sh raft-dask python/raft-dask diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index ce1fed94f7..ac076f5505 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -49,7 +49,7 @@ dependencies: - rapids-dask-dependency==24.2.* - recommonmark - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 16504fb4ca..b3ded51bb5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -49,7 +49,7 @@ dependencies: - rapids-dask-dependency==24.2.* - recommonmark - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml index b2d4a50df8..c0eede1389 100644 --- a/conda/environments/all_cuda-120_arch-aarch64.yaml +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -45,7 +45,7 @@ dependencies: - rapids-dask-dependency==24.2.* - recommonmark - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index adb1fd74b7..cebaf96493 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -45,7 +45,7 @@ dependencies: - rapids-dask-dependency==24.2.* - recommonmark - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 39c6d24889..87e3942e6a 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -39,6 +39,6 @@ dependencies: - pandas - pyyaml - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index e0f46085d3..a4ac253a85 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -39,6 +39,6 @@ dependencies: - pandas - pyyaml - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 2c69bc5325..9ef9799363 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -35,6 +35,6 @@ dependencies: - pandas - pyyaml - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml 
b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 4f4b0d5019..5fa09096ba 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -35,6 +35,6 @@ dependencies: - pandas - pyyaml - rmm==24.2.* -- scikit-build>=0.13.1 +- scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index b8a088d0f3..c736ffb877 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -50,7 +50,7 @@ requirements: - libraft-headers {{ version }} - python x.x - rmm ={{ minor_version }} - - scikit-build >=0.13.1 + - scikit-build-core >=0.7.0 - setuptools run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index eae5a6affe..944b500428 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -50,7 +50,7 @@ requirements: - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} - - scikit-build >=0.13.1 + - scikit-build-core >=0.7.0 - setuptools - ucx {{ ucx_version }} - ucx-proc=*=gpu diff --git a/dependencies.yaml b/dependencies.yaml index 64ff648c48..f049c75511 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -64,7 +64,6 @@ files: includes: - build - build_pylibraft - - build_wheels py_run_pylibraft: output: pyproject pyproject_dir: python/pylibraft @@ -89,7 +88,6 @@ files: table: build-system includes: - build - - build_wheels py_run_raft_dask: output: pyproject pyproject_dir: python/raft-dask @@ -133,12 +131,15 @@ dependencies: - &cmake_ver cmake>=3.26.4 - cython>=3.0.0 - ninja - - scikit-build>=0.13.1 - output_types: [conda] packages: - c-compiler - cxx-compiler - nccl>=2.9.9 + - scikit-build-core>=0.7.0 + - output_types: [requirements, pyproject] + packages: + - scikit-build-core[pyproject]>=0.7.0 specific: - output_types: conda matrices: diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 3b0417b850..f7e114ae66 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -26,11 +26,7 @@ rapids_cuda_init_architectures(pylibraft) project( pylibraft VERSION ${pylibraft_version} - LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C - # language to be enabled here. The test project that is built in scikit-build to verify - # various linking options for the python library is hardcoded to build with C, so until - # that is fixed we need to keep C. 
- C CXX CUDA + LANGUAGES CXX CUDA ) option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" @@ -51,7 +47,7 @@ else() set(raft_FOUND OFF) endif() -include(rapids-cython) +include(rapids-cython-core) if(NOT raft_FOUND) set(BUILD_TESTS OFF) diff --git a/python/pylibraft/README.md b/python/pylibraft/README.md new file mode 120000 index 0000000000..fe84005413 --- /dev/null +++ b/python/pylibraft/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index f811afa55f..5070d6cf6f 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -20,11 +20,9 @@ requires = [ "cython>=3.0.0", "ninja", "rmm==24.2.*", - "scikit-build>=0.13.1", - "setuptools", - "wheel", + "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -build-backend = "setuptools.build_meta" +build-backend = "scikit_build_core.build" [project] name = "pylibraft" @@ -61,12 +59,6 @@ test = [ Homepage = "https://github.com/rapidsai/raft" Documentation = "https://docs.rapids.ai/api/raft/stable/" -[tool.setuptools] -license-files = ["LICENSE"] - -[tool.setuptools.dynamic] -version = {file = "pylibraft/VERSION"} - [tool.isort] line_length = 79 multi_line_output = 3 @@ -109,3 +101,16 @@ skip = [ "dist", "__init__.py", ] + +[tool.scikit-build] +build-dir = "build/{wheel_tag}" +cmake.build-type = "Release" +cmake.minimum-version = "3.26.4" +ninja.make-fallback = true +sdist.reproducible = true +wheel.packages = ["pylibraft"] + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.regex" +input = "pylibraft/VERSION" +regex = "(?P.*)" diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py deleted file mode 100644 index 738bc2cf5b..0000000000 --- a/python/pylibraft/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -# -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from setuptools import find_packages -from skbuild import setup - - -def exclude_libcxx_symlink(cmake_manifest): - return list( - filter( - lambda name: not ("include/rapids/libcxx/include" in name), - cmake_manifest, - ) - ) - - -packages = find_packages(include=["pylibraft*"]) -setup( - # Don't want libcxx getting pulled into wheel builds. 
- cmake_process_manifest_hook=exclude_libcxx_symlink, - packages=packages, - package_data={key: ["VERSION", "*.pxd"] for key in packages}, - zip_safe=False, -) diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 0deed549a7..1d27e49583 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -23,11 +23,7 @@ rapids_cuda_init_architectures(raft-dask-python) project( raft-dask-python VERSION ${raft_dask_version} - LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C - # language to be enabled here. The test project that is built in scikit-build to verify - # various linking options for the python library is hardcoded to build with C, so until - # that is fixed we need to keep C. - C CXX CUDA + LANGUAGES CXX CUDA ) option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" @@ -59,7 +55,7 @@ if(NOT raft_FOUND) find_package(NCCL REQUIRED) endif() -include(rapids-cython) +include(rapids-cython-core) rapids_cython_init() add_subdirectory(raft_dask/common) diff --git a/python/raft-dask/README.md b/python/raft-dask/README.md new file mode 120000 index 0000000000..fe84005413 --- /dev/null +++ b/python/raft-dask/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 807c413160..37d94be7f9 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -14,13 +14,12 @@ [build-system] +build-backend = "scikit_build_core.build" requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "scikit-build>=0.13.1", - "setuptools", - "wheel", + "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] @@ -59,12 +58,6 @@ test = [ Homepage = "https://github.com/rapidsai/raft" Documentation = "https://docs.rapids.ai/api/raft/stable/" -[tool.setuptools] -license-files = ["LICENSE"] - -[tool.setuptools.dynamic] -version = {file = "raft_dask/VERSION"} - [tool.isort] line_length = 79 multi_line_output = 3 @@ -108,3 +101,16 @@ skip = [ "dist", "__init__.py", ] + +[tool.scikit-build] +build-dir = "build/{wheel_tag}" +cmake.build-type = "Release" +cmake.minimum-version = "3.26.4" +ninja.make-fallback = true +sdist.reproducible = true +wheel.packages = ["raft_dask"] + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.regex" +input = "raft_dask/VERSION" +regex = "(?P.*)" diff --git a/python/raft-dask/setup.py b/python/raft-dask/setup.py deleted file mode 100644 index dda58c3341..0000000000 --- a/python/raft-dask/setup.py +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - from setuptools import find_packages -from skbuild import setup - - -def exclude_libcxx_symlink(cmake_manifest): - return list( - filter( - lambda name: not ("include/rapids/libcxx/include" in name), - cmake_manifest, - ) - ) - - -packages = find_packages(include=["raft_dask*"]) -setup( - cmake_process_manifest_hook=exclude_libcxx_symlink, - packages=packages, - package_data={key: ["VERSION", "*.pxd"] for key in packages}, - zip_safe=False, -) From 1beb556ab622da5586d89f7f1fd37ceade33f0bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Fri, 15 Dec 2023 04:03:31 +0100 Subject: [PATCH 04/11] [BUG] fix empty initialization of device_ndarray in pylibraft (#2061) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `device_ndarray.empty()` function can be used to allocate device memory without initialization. Previously, the memory was allocated (uninitialized) on the host and then copied to the device. This PR fixes the behavior of `empty()` by allowing the `device_ndarray` to be initialized from an `array_interface` instead of a `numpy.ndarray` instance, which conditionally allows skipping the initialization of the `DeviceBuffer`. CC @tfeher, @cjnolet Authors: - Malte Förster (https://github.com/mfoerste4) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2061 --- .../pylibraft/common/device_ndarray.py | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/python/pylibraft/pylibraft/common/device_ndarray.py b/python/pylibraft/pylibraft/common/device_ndarray.py index f267e0c644..ae7bb2cabf 100644 --- a/python/pylibraft/pylibraft/common/device_ndarray.py +++ b/python/pylibraft/pylibraft/common/device_ndarray.py @@ -31,7 +31,9 @@ def __init__(self, np_ndarray): Parameters ---------- - ndarray : A numpy.ndarray which will be copied and moved to the device + ndarray : Can be a numpy.ndarray, an array-like object, or even directly + an __array_interface__. Only in the case of a numpy.ndarray will its + contents be copied to the device.
Examples -------- @@ -58,11 +60,38 @@ def __init__(self, np_ndarray): raft_array = device_ndarray.empty((100, 50)) torch_tensor = torch.as_tensor(raft_array, device='cuda') """ - self.ndarray_ = np_ndarray + + if type(np_ndarray) is np.ndarray: + # np_ndarray IS an actual numpy.ndarray + self.__array_interface__ = np_ndarray.__array_interface__.copy() + self.ndarray_ = np_ndarray + copy = True + elif hasattr(np_ndarray, "__array_interface__"): + # np_ndarray HAS an __array_interface__ + self.__array_interface__ = np_ndarray.__array_interface__.copy() + self.ndarray_ = np_ndarray + copy = False + elif all( + name in np_ndarray for name in {"typestr", "shape", "version"} + ): + # np_ndarray IS an __array_interface__ + self.__array_interface__ = np_ndarray.copy() + self.ndarray_ = None + copy = False + else: + raise ValueError( + "np_ndarray should be or contain __array_interface__" + ) + order = "C" if self.c_contiguous else "F" - self.device_buffer_ = rmm.DeviceBuffer.to_device( - self.ndarray_.tobytes(order=order) - ) + if copy: + self.device_buffer_ = rmm.DeviceBuffer.to_device( + self.ndarray_.tobytes(order=order) + ) + else: + self.device_buffer_ = rmm.DeviceBuffer( + size=np.prod(self.shape) * self.dtype.itemsize + ) @classmethod def empty(cls, shape, dtype=np.float32, order="C"): @@ -82,7 +111,7 @@ def empty(cls, shape, dtype=np.float32, order="C"): or column-major (Fortran-style) order in memory """ arr = np.empty(shape, dtype=dtype, order=order) - return cls(arr) + return cls(arr.__array_interface__.copy()) @property def c_contiguous(self): @@ -104,7 +133,7 @@ def dtype(self): """ Datatype of the current device_ndarray instance """ - array_interface = self.ndarray_.__array_interface__ + array_interface = self.__array_interface__ return np.dtype(array_interface["typestr"]) @property @@ -112,7 +141,7 @@ def shape(self): """ Shape of the current device_ndarray instance """ - array_interface = self.ndarray_.__array_interface__ + array_interface = self.__array_interface__ return array_interface["shape"] @property @@ -120,7 +149,7 @@ def strides(self): """ Strides of the current device_ndarray instance """ - array_interface = self.ndarray_.__array_interface__ + array_interface = self.__array_interface__ return array_interface.get("strides") @property @@ -131,7 +160,7 @@ def __cuda_array_interface__(self): zero-copy semantics. """ device_cai = self.device_buffer_.__cuda_array_interface__ - host_cai = self.ndarray_.__array_interface__.copy() + host_cai = self.__array_interface__.copy() host_cai["data"] = (device_cai["data"][0], device_cai["data"][1]) return host_cai From 2962169ebf1fdd2f104c734058c06c5b5442fdfb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Dec 2023 14:48:31 -0600 Subject: [PATCH 05/11] Update to CCCL 2.2.0. (#2049) This PR updates RAFT to CCCL 2.2.0. Do not merge until all of RAPIDS is ready to update. Depends on #2048. Replaces #1464. 
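For downstream code, the switch is intended to be transparent: Thrust, CUB, and libcudacxx now ship together as CCCL, and existing include paths keep working once a target links `CCCL::CCCL` instead of the old `raft::Thrust`. A minimal sketch of an unaffected consumer, assuming only a CUDA-enabled build whose CMake target links `CCCL::CCCL` (directly, or transitively through `raft::raft`):

```c++
// Thrust usage is unchanged by the CCCL migration; only the CMake target
// providing these headers moves from raft::Thrust to CCCL::CCCL.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

int main()
{
  thrust::device_vector<int> ones(128, 1);             // fill 128 ones on the GPU
  int sum = thrust::reduce(ones.begin(), ones.end());  // device-side reduction
  return sum == 128 ? 0 : 1;
}
```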
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/raft/pull/2049 --- cpp/CMakeLists.txt | 16 +++------------- .../{get_thrust.cmake => get_cccl.cmake} | 13 +++++-------- 2 files changed, 8 insertions(+), 21 deletions(-) rename cpp/cmake/thirdparty/{get_thrust.cmake => get_cccl.cmake} (70%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bccbc8c471..dbce46abfe 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -177,8 +177,8 @@ include(cmake/modules/ConfigureCUDA.cmake) rapids_cpm_init() if(NOT BUILD_CPU_ONLY) - # thrust before rmm/cuco so we get the right version of thrust/cub - include(cmake/thirdparty/get_thrust.cmake) + # CCCL before rmm/cuco so we get the right version of CCCL + include(cmake/thirdparty/get_cccl.cmake) include(cmake/thirdparty/get_rmm.cmake) include(cmake/thirdparty/get_cutlass.cmake) @@ -206,7 +206,7 @@ target_include_directories( if(NOT BUILD_CPU_ONLY) # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target. - target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass raft::Thrust) + target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass CCCL::CCCL) endif() target_compile_features(raft INTERFACE cxx_std_17 $) @@ -628,16 +628,6 @@ Imported Targets: set(code_string ${nvtx_export_string}) -string( - APPEND - code_string - [=[ -if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) -endif() -]=] -) - string( APPEND code_string diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_cccl.cmake similarity index 70% rename from cpp/cmake/thirdparty/get_thrust.cmake rename to cpp/cmake/thirdparty/get_cccl.cmake index 6e37aab40d..c608ee4630 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_cccl.cmake @@ -12,13 +12,10 @@ # the License. # ============================================================================= -# Use CPM to find or clone thrust -function(find_and_configure_thrust) - include(${rapids-cmake-dir}/cpm/thrust.cmake) - - rapids_cpm_thrust( NAMESPACE raft - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) +# Use CPM to find or clone CCCL +function(find_and_configure_cccl) + include(${rapids-cmake-dir}/cpm/cccl.cmake) + rapids_cpm_cccl(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) endfunction() -find_and_configure_thrust() +find_and_configure_cccl() From 7e098b22669d8f3b0152b4cb20ad854090182d47 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 19 Dec 2023 16:04:34 -0800 Subject: [PATCH 06/11] Benchmark brute force knn (#2063) Add our bfknn code to the raft-ann-bench project Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2063 --- cpp/bench/ann/CMakeLists.txt | 5 +- .../ann/src/faiss/faiss_cpu_benchmark.cpp | 2 +- .../ann/src/faiss/faiss_gpu_benchmark.cu | 2 +- .../src/raft/raft_ann_bench_param_parser.h | 2 +- cpp/bench/ann/src/raft/raft_benchmark.cu | 9 +- cpp/bench/ann/src/raft/raft_wrapper.h | 84 ++++++++++--------- .../src/raft-ann-bench/run/__main__.py | 4 +- .../run/conf/algos/faiss_cpu_flat.yaml | 5 ++ .../run/conf/algos/faiss_gpu_flat.yaml | 5 ++ .../run/conf/algos/raft_brute_force.yaml | 5 ++ 10 files changed, 74 insertions(+), 49 deletions(-) create mode 100644 python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml create mode 100644 python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml create mode 100644 python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index c144d1399e..16b0f7e1ac 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -18,9 +18,6 @@ option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT - "Include faiss' cpu brute-force knn algorithm in benchmark" ON -) option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark" @@ -30,6 +27,7 @@ option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE "Include raft's brute force knn in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA in benchmark" ON) option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) @@ -55,6 +53,7 @@ if(BUILD_CPU_ONLY) set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF) set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF) + set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) set(RAFT_ANN_BENCH_USE_GGNN OFF) else() diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp index 97d1bbf307..e3e25a99a2 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp +++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp @@ -143,7 +143,7 @@ std::unique_ptr::AnnSearchParam> create_search parse_search_param(conf, *param); return param; } else if (algo == "faiss_cpu_flat") { - auto param = std::make_unique::AnnSearchParam>(); + auto param = std::make_unique::SearchParam>(); return param; } // else diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu index 8b04ba1980..a9388531cc 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -143,7 +143,7 @@ std::unique_ptr::AnnSearchParam> create_search 
parse_search_param(conf, *param); return param; } else if (algo == "faiss_gpu_flat") { - auto param = std::make_unique::AnnSearchParam>(); + auto param = std::make_unique::SearchParam>(); return param; } // else diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 1eb0e53cc5..2a021a8a12 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -20,7 +20,7 @@ #include #undef WARP_SIZE -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#ifdef RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE #include "raft_wrapper.h" #endif #ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index b776a9fafb..cfc30bef7d 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +48,10 @@ std::unique_ptr> create_algo(const std::string& algo, std::unique_ptr> ann; if constexpr (std::is_same_v) { -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#ifdef RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE + if (algo == "raft_brute_force") { + ann = std::make_unique>(metric, dim); + } #endif } @@ -85,7 +88,7 @@ template std::unique_ptr::AnnSearchParam> create_search_param( const std::string& algo, const nlohmann::json& conf) { -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#ifdef RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE if (algo == "raft_brute_force") { auto param = std::make_unique::AnnSearchParam>(); return param; diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index 499bdf29a1..eae615cba1 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -17,33 +17,33 @@ #include #include +#include #include #include -#include +#include +#include #include #include #include #include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" namespace raft_temp { inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) { - if (metric == raft::bench::ann::Metric::kInnerProduct) { - return raft::distance::DistanceType::InnerProduct; - } else if (metric == raft::bench::ann::Metric::kEuclidean) { - return raft::distance::DistanceType::L2Expanded; - } else { - throw std::runtime_error("raft supports only metric type of inner product and L2"); + switch (metric) { + case raft::bench::ann::Metric::kInnerProduct: return raft::distance::DistanceType::InnerProduct; + case raft::bench::ann::Metric::kEuclidean: return raft::distance::DistanceType::L2Expanded; + default: throw std::runtime_error("raft supports only metric type of inner product and L2"); } } - } // namespace raft_temp namespace raft::bench::ann { -// brute force fused L2 KNN - RAFT +// brute force KNN - RAFT template class RaftGpu : public ANN { public: @@ -74,9 +74,13 @@ class RaftGpu : public ANN { } void set_search_dataset(const T* dataset, size_t nrow) override; void save(const std::string& file) const override; - void load(const std::string&) override { return; }; + void load(const std::string&) override; + std::unique_ptr> copy() override; protected: + // handle_ must go first to make sure it dies last and all memory allocated in pool + configured_raft_resources handle_{}; + std::shared_ptr> index_; raft::distance::DistanceType metric_type_; int device_; const T* dataset_; @@ -87,16 +91,19 @@ 
template RaftGpu::RaftGpu(Metric metric, int dim) : ANN(metric, dim), metric_type_(raft_temp::parse_metric_type(metric)) { - static_assert(std::is_same_v, "raft support only float type"); - assert(metric_type_ == raft::distance::DistanceType::L2Expanded); + static_assert(std::is_same_v || std::is_same_v, + "raft bfknn only supports float/double"); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } template -void RaftGpu::build(const T*, size_t, cudaStream_t) +void RaftGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) { - // as this is brute force algo so no index building required - return; + auto dataset_view = raft::make_host_matrix_view(dataset, nrow, this->dim_); + index_ = std::make_shared>( + std::move(raft::neighbors::brute_force::build(handle_, dataset_view))); + + handle_.stream_wait(stream); } template @@ -115,15 +122,14 @@ void RaftGpu::set_search_dataset(const T* dataset, size_t nrow) template void RaftGpu::save(const std::string& file) const { - // create a empty index file as no index to store. - std::fstream fp; - fp.open(file.c_str(), std::ios::out); - if (!fp) { - printf("Error in creating file!!!\n"); - ; - return; - } - fp.close(); + raft::neighbors::brute_force::serialize(handle_, file, *index_); +} + +template +void RaftGpu::load(const std::string& file) +{ + index_ = std::make_shared>( + std::move(raft::neighbors::brute_force::deserialize(handle_, file))); } template @@ -134,20 +140,22 @@ void RaftGpu::search(const T* queries, float* distances, cudaStream_t stream) const { - // TODO: Integrate new `raft::brute_force::index` (from - // https://github.com/rapidsai/raft/pull/1817) - raft::spatial::knn::detail::fusedL2Knn(this->dim_, - reinterpret_cast(neighbors), - distances, - dataset_, - queries, - nrow_, - static_cast(batch_size), - k, - true, - true, - stream, - metric_type_); + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, this->dim_); + + auto neighbors_view = raft::make_device_matrix_view(neighbors, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + raft::neighbors::brute_force::search( + handle_, *index_, queries_view, neighbors_view, distances_view); + + handle_.stream_wait(stream); +} + +template +std::unique_ptr> RaftGpu::copy() +{ + return std::make_unique>(*this); // use copy constructor } } // namespace raft::bench::ann diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 9841b47b98..a1f97d67d5 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -498,8 +498,8 @@ def add_algo_group(group_list): ) if executable not in executables_to_run: executables_to_run[executable] = {"index": []} - build_params = algos_conf[algo]["groups"][group]["build"] - search_params = algos_conf[algo]["groups"][group]["search"] + build_params = algos_conf[algo]["groups"][group]["build"] or {} + search_params = algos_conf[algo]["groups"][group]["search"] or {} param_names = [] param_lists = [] diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml new file mode 100644 index 0000000000..25eaf03d40 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml @@ -0,0 +1,5 @@ +name: faiss_cpu_flat +groups: + base: + build: + search: diff --git 
a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml new file mode 100644 index 0000000000..a722e1b91c --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml @@ -0,0 +1,5 @@ +name: faiss_gpu_flat +groups: + base: + build: + search: diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml new file mode 100644 index 0000000000..da99841f9b --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml @@ -0,0 +1,5 @@ +name: raft_brute_force +groups: + base: + build: + search: From bae049bf59ac209a58881605426fb14b112a67fc Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 21 Dec 2023 10:22:51 -0500 Subject: [PATCH 07/11] Update `raft-ann-bench` output filenames and add features to plotting (#2043) This PR: 1. Adds more clarity to filenames by using `,` as separator instead of `_` 2. Adds 80% and 99% recall bars to build plots 3. Does not plot a recall level in build plot if no data is present 4. Adds a `x-start` argument which allows controlling the minimum recall level used on the x-axis of the search plot 5. Fixes sometimes occurring multi-line issue in search plots 6. Build time plots now plot average build times for an index corresponding a search query in each recall range Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2043 --- docs/source/raft_ann_benchmarks.md | 13 +- .../raft-ann-bench/data_export/__main__.py | 34 +++-- .../src/raft-ann-bench/plot/__main__.py | 131 ++++++++++++------ .../src/raft-ann-bench/run/__main__.py | 33 +++-- 4 files changed, 136 insertions(+), 75 deletions(-) diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index dcdfc2cec9..a2fe820317 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -171,8 +171,8 @@ options: `algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `.`, or for example, `raft_cagra.large` -For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/` -and an index search statistics JSON file in `/result/search/`. NOTE: The filenams will not have "_{group}" if `group = "base"`. +For every algorithm run by this script, it outputs an index build statistics JSON file in `/result/build/<{algo},{group}.json>` +and an index search statistics JSON file in `/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`. `dataset-path` : 1. data is read from `/` @@ -198,8 +198,8 @@ options: --dataset-path DATASET_PATH path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR}) ``` -Build statistics CSV file is stored in `/result/build/` -and index search statistics CSV file in `/result/search/`, where suffix has three values: +Build statistics CSV file is stored in `/result/build/<{algo},{group}.csv>` +and index search statistics CSV file in `/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values: 1. `raw`: All search results are exported 2. 
`throughput`: Pareto frontier of throughput results is exported 3. `latency`: Pareto frontier of latency results is exported @@ -212,8 +212,8 @@ CSV files `/result/search/*.csv`. The usage of this script is: ```bash usage: [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] - [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--mode {throughput,latency}] [--time-unit {s,ms,us}] - [--raw] + [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}] + [--time-unit {s,ms,us}] [--raw] options: -h, --help show this help message and exit @@ -237,6 +237,7 @@ options: --x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear) --y-scale {linear,log,symlog,logit} Scale to use when drawing the Y-axis (default: linear) + --x-start X_START Recall values to start the x-axis from (default: 0.8) --mode {throughput,latency} search mode whose Pareto frontier is used on the y-axis (default: throughput) --time-unit {s,ms,us} diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py index 5cb06c573f..c8a6375577 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py @@ -74,7 +74,9 @@ def read_file(dataset, dataset_path, method): try: data = json.load(f) df = pd.DataFrame(data["benchmarks"]) - yield os.path.join(dir, file), file.split("-")[0], df + filename_split = file.split(",") + algo_name = (filename_split[0], filename_split[1]) + yield os.path.join(dir, file), algo_name, df except Exception as e: print( "An error occurred processing file %s (%s). " @@ -85,7 +87,10 @@ def read_file(dataset, dataset_path, method): def convert_json_to_csv_build(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "build"): try: - algo_name = algo_name.replace("_base", "") + if "base" in algo_name[1]: + algo_name = algo_name[0] + else: + algo_name = "_".join(algo_name) df["name"] = df["name"].str.split("/").str[0] write = pd.DataFrame( { @@ -97,12 +102,7 @@ def convert_json_to_csv_build(dataset, dataset_path): for name in df: if name not in skip_build_cols: write[name] = df[name] - filepath = os.path.normpath(file).split(os.sep) - filename = filepath[-1].split("-")[0] + ".csv" - write.to_csv( - os.path.join(f"{os.sep}".join(filepath[:-1]), filename), - index=False, - ) + write.to_csv(file.replace(".json", ".csv"), index=False) except Exception as e: print( "An error occurred processing file %s (%s). Skipping..." 
@@ -140,9 +140,17 @@ def convert_json_to_csv_search(dataset, dataset_path): for file, algo_name, df in read_file(dataset, dataset_path, "search"): try: build_file = os.path.join( - dataset_path, dataset, "result", "build", f"{algo_name}.csv" + dataset_path, + dataset, + "result", + "build", + f"{','.join(algo_name)}.csv", ) - algo_name = algo_name.replace("_base", "") + print(build_file) + if "base" in algo_name[1]: + algo_name = algo_name[0] + else: + algo_name = "_".join(algo_name) df["name"] = df["name"].str.split("/").str[0] try: write = pd.DataFrame( @@ -201,13 +209,13 @@ def convert_json_to_csv_search(dataset, dataset_path): "appended in the Search CSV" ) - write.to_csv(file.replace(".json", "_raw.csv"), index=False) + write.to_csv(file.replace(".json", ",raw.csv"), index=False) throughput = get_frontier(write, "throughput") throughput.to_csv( - file.replace(".json", "_throughput.csv"), index=False + file.replace(".json", ",throughput.csv"), index=False ) latency = get_frontier(write, "latency") - latency.to_csv(file.replace(".json", "_latency.csv"), index=False) + latency.to_csv(file.replace(".json", ",latency.csv"), index=False) except Exception as e: print( "An error occurred processing file %s (%s). Skipping..." diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py index 8bd54170c9..86fd527f5f 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py @@ -62,6 +62,19 @@ def positive_int(input_str: str) -> int: return i +def positive_float(input_str: str) -> float: + try: + i = float(input_str) + if i < 0.0: + raise ValueError + except ValueError: + raise argparse.ArgumentTypeError( + f"{input_str} is not a positive float" + ) + + return i + + def generate_n_colors(n): vs = np.linspace(0.3, 0.9, 7) colors = [(0.9, 0.4, 0.4, 1.0)] @@ -113,9 +126,11 @@ def create_plot_search( batch_size, mode, time_unit, + x_start, ): xn = "k-nn" xm, ym = (metrics[xn], metrics[mode]) + xm["lim"][0] = x_start # Now generate each plot handles = [] labels = [] @@ -211,20 +226,15 @@ def inv_fun(x): def create_plot_build( - build_results, search_results, linestyles, fn_out, dataset + build_results, search_results, linestyles, fn_out, dataset, k, batch_size ): + bt_80 = [0] * len(linestyles) - qps_85 = [-1] * len(linestyles) - bt_85 = [0] * len(linestyles) - i_85 = [-1] * len(linestyles) - - qps_90 = [-1] * len(linestyles) bt_90 = [0] * len(linestyles) - i_90 = [-1] * len(linestyles) - qps_95 = [-1] * len(linestyles) bt_95 = [0] * len(linestyles) - i_95 = [-1] * len(linestyles) + + bt_99 = [0] * len(linestyles) data = OrderedDict() colors = OrderedDict() @@ -237,35 +247,59 @@ def mean_y(algo): for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)): points = np.array(search_results[algo], dtype=object) + # x is recall, ls is algo_name, idxs is index_name xs = points[:, 2] - ys = points[:, 3] ls = points[:, 0] idxs = points[:, 1] - # x is recall, y is qps, ls is algo_name, idxs is index_name + + len_80, len_90, len_95, len_99 = 0, 0, 0, 0 for i in range(len(xs)): - if xs[i] >= 0.85 and xs[i] < 0.9 and ys[i] > qps_85[pos]: - qps_85[pos] = ys[i] - bt_85[pos] = build_results[(ls[i], idxs[i])][0][2] - i_85[pos] = idxs[i] - elif xs[i] >= 0.9 and xs[i] < 0.95 and ys[i] > qps_90[pos]: - qps_90[pos] = ys[i] - bt_90[pos] = build_results[(ls[i], idxs[i])][0][2] - i_90[pos] = idxs[i] - elif xs[i] >= 0.95 and ys[i] > qps_95[pos]: - qps_95[pos] = ys[i] - 
bt_95[pos] = build_results[(ls[i], idxs[i])][0][2] - i_95[pos] = idxs[i] - data[algo] = [bt_85[pos], bt_90[pos], bt_95[pos]] + if xs[i] >= 0.80 and xs[i] < 0.90: + bt_80[pos] = bt_80[pos] + build_results[(ls[i], idxs[i])][0][2] + len_80 = len_80 + 1 + elif xs[i] >= 0.9 and xs[i] < 0.95: + bt_90[pos] = bt_90[pos] + build_results[(ls[i], idxs[i])][0][2] + len_90 = len_90 + 1 + elif xs[i] >= 0.95 and xs[i] < 0.99: + bt_95[pos] = bt_95[pos] + build_results[(ls[i], idxs[i])][0][2] + len_95 = len_95 + 1 + elif xs[i] >= 0.99: + bt_99[pos] = bt_99[pos] + build_results[(ls[i], idxs[i])][0][2] + len_99 = len_99 + 1 + if len_80 > 0: + bt_80[pos] = bt_80[pos] / len_80 + if len_90 > 0: + bt_90[pos] = bt_90[pos] / len_90 + if len_95 > 0: + bt_95[pos] = bt_95[pos] / len_95 + if len_99 > 0: + bt_99[pos] = bt_99[pos] / len_99 + data[algo] = [ + bt_80[pos], + bt_90[pos], + bt_95[pos], + bt_99[pos], + ] colors[algo] = linestyles[algo][0] - index = ["@85% Recall", "@90% Recall", "@95% Recall"] + index = [ + "@80% Recall", + "@90% Recall", + "@95% Recall", + "@99% Recall", + ] df = pd.DataFrame(data, index=index) + df.replace(0.0, np.nan, inplace=True) + df = df.dropna(how="all") plt.figure(figsize=(12, 9)) ax = df.plot.bar(rot=0, color=colors) fig = ax.get_figure() print(f"writing build output to {fn_out}") - plt.title("Build Time for Highest QPS") + plt.title( + "Average Build Time within Recall Range " + f"for k={k} batch_size={batch_size}" + ) plt.suptitle(f"{dataset}") plt.ylabel("Build Time (s)") fig.savefig(fn_out) @@ -344,9 +378,9 @@ def load_all_results( ] elif method == "search": if raw: - suffix = "_raw" + suffix = ",raw" else: - suffix = f"_{mode}" + suffix = f",{mode}" result_files = [ result_file for result_file in result_files @@ -356,22 +390,20 @@ def load_all_results( raise FileNotFoundError(f"No CSV result files found in {results_path}") if method == "search": - result_files = [ - result_filename - for result_filename in result_files - if f"{k}-{batch_size}" in result_filename - ] - algo_group_files = [ - result_filename.split("-")[0] for result_filename in result_files - ] - else: - algo_group_files = [ - result_filename for result_filename in result_files - ] - - for i in range(len(algo_group_files)): - algo_group = algo_group_files[i].replace(".csv", "").split("_") - algo_group_files[i] = ("_".join(algo_group[:-1]), algo_group[-1]) + filter_k_bs = [] + for result_filename in result_files: + filename_split = result_filename.split(",") + if ( + int(filename_split[-3][1:]) == k + and int(filename_split[-2][2:]) == batch_size + ): + filter_k_bs.append(result_filename) + result_files = filter_k_bs + + algo_group_files = [ + result_filename.replace(".csv", "").split(",")[:2] + for result_filename in result_files + ] algo_group_files = list(zip(*algo_group_files)) if len(algorithms) > 0: @@ -478,6 +510,12 @@ def main(): choices=["linear", "log", "symlog", "logit"], default="linear", ) + parser.add_argument( + "--x-start", + help="Recall values to start the x-axis from", + default=0.8, + type=positive_float, + ) parser.add_argument( "--mode", help="search mode whose Pareto frontier is used on the y-axis", @@ -525,7 +563,7 @@ def main(): ) build_output_filepath = os.path.join( args.output_filepath, - f"build-{args.dataset}.png", + f"build-{args.dataset}-k{k}-batch_size{batch_size}.png", ) search_results = load_all_results( @@ -554,6 +592,7 @@ def main(): batch_size, args.mode, args.time_unit, + args.x_start, ) if build: build_results = load_all_results( @@ -575,6 +614,8 @@ def main(): linestyles, 
build_output_filepath, args.dataset, + k, + batch_size, ) diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index a1f97d67d5..52d536c2e8 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -115,14 +115,16 @@ def validate_algorithm(algos_conf, algo, gpu_present): def find_executable(algos_conf, algo, group, k, batch_size): executable = algos_conf[algo]["executable"] - return_str = f"{algo}_{group}-{k}-{batch_size}" + file_name = (f"{algo},{group}", f"{algo},{group},k{k},bs{batch_size}") build_path = os.getenv("RAFT_HOME") if build_path is not None: - build_path = os.path.join(build_path, "cpp", "build", executable) + build_path = os.path.join( + build_path, "cpp", "build", "release", executable + ) if os.path.exists(build_path): print(f"-- Using RAFT bench from repository in {build_path}. ") - return (executable, build_path, return_str) + return (executable, build_path, file_name) # if there is no build folder present, we look in the conda environment conda_path = os.getenv("CONDA_PREFIX") @@ -130,7 +132,7 @@ def find_executable(algos_conf, algo, group, k, batch_size): conda_path = os.path.join(conda_path, "bin", "ann", executable) if os.path.exists(conda_path): print("-- Using RAFT bench found in conda environment. ") - return (executable, conda_path, return_str) + return (executable, conda_path, file_name) else: raise FileNotFoundError(executable) @@ -152,15 +154,21 @@ def run_build_and_search( mode="throughput", raft_log_level="info", ): - for executable, ann_executable_path, algo in executables_to_run.keys(): + for ( + executable, + ann_executable_path, + output_filename, + ) in executables_to_run.keys(): # Need to write temporary configuration - temp_conf_filename = f"{conf_filename}_{algo}_{uuid.uuid1()}.json" + temp_conf_filename = ( + f"{conf_filename}_{output_filename[1]}_{uuid.uuid1()}.json" + ) with open(temp_conf_filename, "w") as f: temp_conf = dict() temp_conf["dataset"] = conf_file["dataset"] temp_conf["search_basic_param"] = conf_file["search_basic_param"] temp_conf["index"] = executables_to_run[ - (executable, ann_executable_path, algo) + (executable, ann_executable_path, output_filename) ]["index"] json_str = json.dumps(temp_conf, indent=2) f.write(json_str) @@ -172,7 +180,7 @@ def run_build_and_search( if build: build_folder = os.path.join(legacy_result_folder, "build") os.makedirs(build_folder, exist_ok=True) - build_file = f"{algo}.json" + build_file = f"{output_filename[0]}.json" temp_build_file = f"{build_file}.lock" cmd = [ ann_executable_path, @@ -190,7 +198,8 @@ def run_build_and_search( if dry_run: print( - "Benchmark command for %s:\n%s\n" % (algo, " ".join(cmd)) + "Benchmark command for %s:\n%s\n" + % (output_filename[0], " ".join(cmd)) ) else: try: @@ -208,6 +217,7 @@ def run_build_and_search( if search: search_folder = os.path.join(legacy_result_folder, "search") os.makedirs(search_folder, exist_ok=True) + search_file = f"{output_filename[1]}.json" cmd = [ ann_executable_path, "--search", @@ -219,7 +229,7 @@ def run_build_and_search( "--benchmark_out_format=json", "--mode=%s" % mode, "--benchmark_out=" - + f"{os.path.join(search_folder, f'{algo}.json')}", + + f"{os.path.join(search_folder, search_file)}", "--raft_log_level=" + f"{parse_log_level(raft_log_level)}", ] if force: @@ -231,7 +241,8 @@ def run_build_and_search( cmd = cmd + [temp_conf_filename] if dry_run: print( - "Benchmark command for 
%s:\n%s\n" % (algo, " ".join(cmd)) + "Benchmark command for %s:\n%s\n" + % (output_filename[1], " ".join(cmd)) ) else: try: From dc442d8f0e74f94b3bceb880f18f043e48802226 Mon Sep 17 00:00:00 2001 From: Vivek Narang <123010842+narangvivek10@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:57:39 -0500 Subject: [PATCH 08/11] Fix a typo (#2070) A code change to fix a typo that I found while going through the codebase. Authors: - Vivek Narang (https://github.com/narangvivek10) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2070 --- cpp/include/raft/neighbors/detail/refine_device.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/refine_device.cuh b/cpp/include/raft/neighbors/detail/refine_device.cuh index 6ee96957fa..337318f791 100644 --- a/cpp/include/raft/neighbors/detail/refine_device.cuh +++ b/cpp/include/raft/neighbors/detail/refine_device.cuh @@ -51,7 +51,7 @@ void refine_device(raft::resources const& handle, uint32_t k = static_cast(indices.extent(1)); RAFT_EXPECTS(k <= raft::matrix::detail::select::warpsort::kMaxCapacity, - "k must be lest than topk::kMaxCapacity (%d).", + "k must be less than topk::kMaxCapacity (%d).", raft::matrix::detail::select::warpsort::kMaxCapacity); common::nvtx::range fun_scope( From 6d3572b098c8650af4742159b75c73c7a802f63f Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 3 Jan 2024 16:59:38 -0500 Subject: [PATCH 09/11] Fix errors with ingroup exposed by doxygen 1.10 (#2079) Allow docs to be built correctly with doxygen 1.10 Authors: - William Hicks (https://github.com/wphicks) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2079 --- cpp/include/raft/core/cublas_macros.hpp | 8 ++++---- cpp/include/raft/linalg/add.cuh | 12 +++++++++--- cpp/include/raft/neighbors/nn_descent_types.hpp | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp index 5c56240ccf..c782c8ded1 100644 --- a/cpp/include/raft/core/cublas_macros.hpp +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace raft { /** - * @ingroup error_handling + * @addtogroup error_handling * @{ */ @@ -76,7 +76,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) #undef _CUBLAS_ERR_TO_STR /** - * @ingroup assertion + * @addtogroup assertion * @{ */ @@ -135,4 +135,4 @@ inline const char* cublas_error_to_string(cublasStatus_t err) #define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) #endif -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index b2cd736c57..26b012956a 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,7 +29,11 @@ namespace raft { namespace linalg { /** - * @ingroup arithmetic + * @defgroup arithmetic Arithmetic functions + * @{ + */ + +/** * @brief Elementwise scalar add operation on the input buffer * * @tparam InT input data-type. Also the data-type upon which the math ops @@ -87,6 +91,8 @@ void addDevScalar( detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } +/** @} */ // end of group add + /** * @defgroup add_dense Addition Arithmetic * @{ @@ -220,4 +226,4 @@ void add_scalar(raft::resources const& handle, }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp index 64e464c618..7d4f3d615b 100644 --- a/cpp/include/raft/neighbors/nn_descent_types.hpp +++ b/cpp/include/raft/neighbors/nn_descent_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ namespace raft::neighbors::experimental::nn_descent { /** - * @ingroup nn_descent + * @ingroup nn-descent * @{ */ From 0d6595462cf0d35b1440295056a72fb24c4ad6da Mon Sep 17 00:00:00 2001 From: William Hicks Date: Wed, 3 Jan 2024 21:16:46 -0500 Subject: [PATCH 10/11] Implement maybe-owning multi-dimensional container (mdbuffer) (#1999) ### What is mdbuffer? This PR introduces a maybe-owning multi-dimensional abstraction called `mdbuffer` to help simplify code that _may_ require an `mdarray` but only if the data are not already in a desired form or location. As a concrete example, consider a function `foo_device` which operates on memory accessible from the device. If we wish to pass it data originating on the host, a separate code path must be created in which a `device_mdarray` is created and the data are explicitly copied from host to device. This leads to a proliferation of branches as `foo_device` interacts with other functions with similar requirements. As an initial simplification, `mdbuffer` allows us to write a single template that accepts an `mdspan` pointing to memory on either host _or_ device and routes it through the same code: ```c++ template void foo_device(raft::resources const& res, mdspan_type data) { auto buf = raft::mdbuffer{res, raft::mdbuffer{data}, raft::memory_type::device}; // Data in buf is now guaranteed to be accessible from device. // If it was already accessible from device, no copy was performed. If it // was not, a copy was performed. some_kernel<<<...>>>(buf.view()); // It is sometimes useful to know whether or not a copy was performed to // e.g. determine whether the transformed data should be copied back to its original // location. This can be checked via the `is_owning()` method. if (buf.is_owning()) { raft::copy(res, data, buf.view()); } } foo_device(res, some_host_mdspan); // Still works; memory is allocated and copy is performed foo_device(res, some_device_mdspan); // Still works and no allocation or copy is required foo_device(res, some_managed_mdspan); // Still works and no allocation or copy is required ``` While this is a useful simplification, it still leads to a proliferation of template instantiations. 
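Concretely, each distinct `mdspan` type that reaches the template stamps out its own full copy of the function body. A contrived sketch of the effect, assuming two small matrices created through the usual RAFT factories:

```c++
// Contrived sketch: both calls run the same logic, but the compiler must
// generate a separate foo_device instantiation per distinct mdspan type.
auto host_mat   = raft::make_host_matrix<float, int>(16, 16);
auto device_mat = raft::make_device_matrix<float, int>(res, 16, 16);
foo_device(res, host_mat.view());    // instantiation for the host mdspan type
foo_device(res, device_mat.view());  // a second instantiation for the device type
```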
If this is undesirable, `mdbuffer` permits a further consolidation through implicit conversion of an mdspan to an mdbuffer: ```c++ void foo_device(raft::resources const& res, raft::mdbuffer>&& data) { auto buf = raft::mdbuffer{res, data, raft::memory_type::device}; some_kernel<<<...>>>(buf.view()); if (buf.is_owning()) { raft::copy(res, data, buf.view()); } } // All of the following work exactly as before but no longer require separate template instantiations foo_device(res, some_host_mdspan); foo_device(res, some_device_mdspan); foo_device(res, some_managed_mdspan); ``` `mdbuffer` also offers a simple way to perform runtime dispatching based on the memory type passed to it using standard C++ patterns. While mdbuffer's `.view()` method takes an optional template parameter indicating the mdspan type to retrieve as a view, that parameter can be omitted to retrieve a `std::variant` of all mdspan types which may provide a view on the `mdbuffer`'s data (depending on its memory type). We can then use `std::visit` to perform runtime dispatching based on where the data are stored: ```c++ void foo(raft::resources const& res, raft::mdbuffer>&& data) { std::visit([](auto view) { if constexpr (typename decltype(view)::accessor_type::is_device_accessible) { // Do something with these data on device } else { // Do something with these data on host } }, data.view()); } ``` In addition to moving data among various memory types (host, device, managed, and pinned currently), `mdbuffer` can be used to coerce data to a desired in-memory layout or to a compatible data type (e.g. floats to doubles). As with changes in the memory type, a copy will be performed if and only if it is necessary. ```c++ template void foo_device(raft::resources const& res, mdspan_type data) { auto buf = raft::mdbuffer, raft::row_major>{res, raft::mdbuffer{data}, raft::memory_type::device}; // Data in buf is now guaranteed to be accessible from device, and // represented by floats in row-major order. some_kernel<<<...>>>(buf.view()); // The same check can be used to determine whether or not a copy was // required, regardless of the cause. I.e. if the data were already on // device but in column-major order, the is_owning() method would still // return true because new storage needed to be allocated. if (buf.is_owning()) { raft::copy(res, data, buf.view()); } } ``` ### What mdbuffer is **not** `mdbuffer` is **not** a replacement for either `mdspan` or `mdarray`. `mdspan` remains the standard object for passing data views throughout the RAFT codebase, and `mdarray` remains the standard object for allocating new multi-dimensional data. This is reflected in the fact that `mdbuffer` can _only_ be constructed from an existing `mdspan` or `mdarray` or another `mdbuffer`. `mdbuffer` is intended to be used solely to simplify code where data _may_ need to be copied to a different location. ### Follow-ups - I have omitted the mdbuffer-based replacement for and generalization of `temporary_device_buffer` since this PR is already enormous. I have this partially written however, and I'll post a link to its current state to help motivate the changes here. - For all necessary copies, `mdbuffer` uses `raft::copy`. For _some_ transformations that require a change in data type or layout, `raft::copy` is not fully optimized. See #1842 for more information. Optimizing this will be an important change to ensure that `mdbuffer` can be used with absolutely minimal overhead in all cases. 
These non-optimized cases represent a small fraction of the real-world use cases we can expect for `mdbuffer`, however, so there should be little concern about beginning to use it as is. - `std::visit`'s performance for a small number of variants is sometimes non-optimal. As a followup, it would be good to benchmark `mdbuffer`'s current performance and compare to internal use of a `visit` implementation that uses a `switch` on the available memory types. Resolve #1602 Authors: - William Hicks (https://github.com/wphicks) - Tarang Jain (https://github.com/tarang-jain) Approvers: - Divye Gala (https://github.com/divyegala) - Corey J. Nolet (https://github.com/cjnolet) - Artem M. Chirkin (https://github.com/achirkin) - Tamas Bela Feher (https://github.com/tfeher) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/1999 --- .../core/detail/fail_container_policy.hpp | 146 +++ .../raft/core/device_container_policy.hpp | 22 +- cpp/include/raft/core/device_mdspan.hpp | 88 +- .../raft/core/host_container_policy.hpp | 3 +- .../raft/core/host_device_accessor.hpp | 12 +- cpp/include/raft/core/host_mdspan.hpp | 10 +- .../raft/core/managed_container_policy.hpp | 86 ++ cpp/include/raft/core/managed_mdarray.hpp | 152 +++ cpp/include/raft/core/managed_mdspan.hpp | 273 +++++ cpp/include/raft/core/mdbuffer.cuh | 1020 +++++++++++++++++ cpp/include/raft/core/mdbuffer.hpp | 26 + cpp/include/raft/core/memory_type.hpp | 56 +- .../raft/core/pinned_container_policy.hpp | 142 +++ cpp/include/raft/core/pinned_mdarray.hpp | 152 +++ cpp/include/raft/core/pinned_mdspan.hpp | 270 +++++ cpp/include/raft/core/serialize.hpp | 3 +- cpp/include/raft/core/stream_view.hpp | 3 +- .../raft/util/memory_type_dispatcher.cuh | 209 ++++ cpp/include/raft/util/variant_utils.hpp | 64 ++ cpp/test/CMakeLists.txt | 4 +- cpp/test/core/mdarray.cu | 3 +- cpp/test/core/mdbuffer.cu | 330 ++++++ cpp/test/core/memory_type.cpp | 34 +- cpp/test/core/numpy_serializer.cu | 3 +- cpp/test/util/memory_type_dispatcher.cu | 421 +++++++ docs/source/cpp_api/mdspan.rst | 2 + docs/source/cpp_api/mdspan_mdarray.rst | 66 +- docs/source/cpp_api/mdspan_mdbuffer.rst | 13 + docs/source/cpp_api/mdspan_mdspan.rst | 39 +- .../source/cpp_api/memory_type_dispatcher.rst | 13 + 30 files changed, 3563 insertions(+), 102 deletions(-) create mode 100644 cpp/include/raft/core/detail/fail_container_policy.hpp create mode 100644 cpp/include/raft/core/managed_container_policy.hpp create mode 100644 cpp/include/raft/core/managed_mdarray.hpp create mode 100644 cpp/include/raft/core/managed_mdspan.hpp create mode 100644 cpp/include/raft/core/mdbuffer.cuh create mode 100644 cpp/include/raft/core/mdbuffer.hpp create mode 100644 cpp/include/raft/core/pinned_container_policy.hpp create mode 100644 cpp/include/raft/core/pinned_mdarray.hpp create mode 100644 cpp/include/raft/core/pinned_mdspan.hpp create mode 100644 cpp/include/raft/util/memory_type_dispatcher.cuh create mode 100644 cpp/include/raft/util/variant_utils.hpp create mode 100644 cpp/test/core/mdbuffer.cu create mode 100644 cpp/test/util/memory_type_dispatcher.cu create mode 100644 docs/source/cpp_api/mdspan_mdbuffer.rst create mode 100644 docs/source/cpp_api/memory_type_dispatcher.rst diff --git a/cpp/include/raft/core/detail/fail_container_policy.hpp b/cpp/include/raft/core/detail/fail_container_policy.hpp new file mode 100644 index 0000000000..ff36659f04 --- /dev/null +++ 
b/cpp/include/raft/core/detail/fail_container_policy.hpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace raft { +namespace detail { + +template +struct fail_reference { + using value_type = typename std::remove_cv_t; + using pointer = T*; + using const_pointer = T const*; + + fail_reference() = default; + template + fail_reference(T* ptr, StreamViewType stream) + { + throw non_cuda_build_error{"Attempted to construct reference to device data in non-CUDA build"}; + } + + operator value_type() const // NOLINT + { + throw non_cuda_build_error{"Attempted to dereference device data in non-CUDA build"}; + return value_type{}; + } + auto operator=(T const& other) -> fail_reference& + { + throw non_cuda_build_error{"Attempted to assign to device data in non-CUDA build"}; + return *this; + } +}; + +/** A placeholder container which throws an exception on use + * + * This placeholder is used in non-CUDA builds for container types that would + * otherwise be provided with CUDA code. Attempting to construct a non-empty + * container of this type throws an exception indicating that there was an + * attempt to use the device from a non-CUDA build. An example of when this + * might happen is if a downstream application attempts to allocate a device + * mdarray using a library built with non-CUDA RAFT. + */ +template +struct fail_container { + using value_type = T; + using size_type = std::size_t; + + using reference = fail_reference; + using const_reference = fail_reference; + + using pointer = value_type*; + using const_pointer = value_type const*; + + using iterator = pointer; + using const_iterator = const_pointer; + + explicit fail_container(size_t n = size_t{}) + { + if (n != size_t{}) { + throw non_cuda_build_error{"Attempted to allocate device container in non-CUDA build"}; + } + } + + template + auto operator[](Index i) noexcept -> reference + { + RAFT_LOG_ERROR("Attempted to access device data in non-CUDA build"); + return reference{}; + } + + template + auto operator[](Index i) const noexcept -> const_reference + { + RAFT_LOG_ERROR("Attempted to access device data in non-CUDA build"); + return const_reference{}; + } + void resize(size_t n) + { + if (n != size_t{}) { + throw non_cuda_build_error{"Attempted to allocate device container in non-CUDA build"}; + } + } + + [[nodiscard]] auto data() noexcept -> pointer { return nullptr; } + [[nodiscard]] auto data() const noexcept -> const_pointer { return nullptr; } +}; + +/** A placeholder container policy which throws an exception on use + * + * This placeholder is used in non-CUDA builds for container types that would + * otherwise be provided with CUDA code. Attempting to construct a non-empty + * container of this type throws an exception indicating that there was an + * attempt to use the device from a non-CUDA build. 
An example of when this + * might happen is if a downstream application attempts to allocate a device + * mdarray using a library built with non-CUDA RAFT. + */ +template +struct fail_container_policy { + using element_type = ElementType; + using container_type = fail_container; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = typename container_type::reference; + using const_reference = typename container_type::const_reference; + + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + auto create(raft::resources const& res, size_t n) -> container_type { return container_type(n); } + + fail_container_policy() = default; + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } +}; + +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index 011de307db..e8717d4c5e 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -6,7 +6,7 @@ */ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ * limitations under the License. */ #pragma once +#ifndef RAFT_DISABLE_CUDA #include #include @@ -196,3 +197,22 @@ class device_uvector_policy { }; } // namespace raft +#else +#include +namespace raft { + +// Provide placeholders that will allow CPU-GPU interoperable codebases to +// compile in non-CUDA mode but which will throw exceptions at runtime on any +// attempt to touch device data + +template +using device_reference = detail::fail_reference; + +template +using device_uvector = detail::fail_container; + +template +using device_uvector_policy = detail::fail_container_policy; + +} // namespace raft +#endif diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index c1898a3f09..3b6165b86a 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,9 +26,6 @@ namespace raft { template using device_accessor = host_device_accessor; -template -using managed_accessor = host_device_accessor; - /** * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location. 
*/ @@ -38,12 +35,6 @@ template > using device_mdspan = mdspan>; -template > -using managed_mdspan = mdspan>; - template struct is_device_mdspan : std::false_type {}; template @@ -61,23 +52,6 @@ using is_input_device_mdspan_t = is_device_mdspan>; template using is_output_device_mdspan_t = is_device_mdspan>; -template -struct is_managed_mdspan : std::false_type {}; -template -struct is_managed_mdspan : std::bool_constant {}; - -/** - * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type - */ -template -using is_managed_mdspan_t = is_managed_mdspan>; - -template -using is_input_managed_mdspan_t = is_managed_mdspan>; - -template -using is_output_managed_mdspan_t = is_managed_mdspan>; - /** * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a * derived type @@ -102,30 +76,6 @@ using enable_if_input_device_mdspan = std::enable_if_t using enable_if_output_device_mdspan = std::enable_if_t>; -/** - * @\brief Boolean to determine if variadic template types Tn are either raft::managed_mdspan or a - * derived type - */ -template -inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; - -template -inline constexpr bool is_input_managed_mdspan_v = - std::conjunction_v...>; - -template -inline constexpr bool is_output_managed_mdspan_v = - std::conjunction_v...>; - -template -using enable_if_managed_mdspan = std::enable_if_t>; - -template -using enable_if_input_managed_mdspan = std::enable_if_t>; - -template -using enable_if_output_managed_mdspan = std::enable_if_t>; - /** * @brief Shorthand for 0-dim host mdspan (scalar). * @tparam ElementType the data type of the scalar element @@ -186,7 +136,7 @@ using device_aligned_matrix_view = template > -auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +auto constexpr make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) { using data_handle_type = typename std::experimental::aligned_accessor{aligned_pointer, extents}; } -/** - * @brief Create a raft::managed_mdspan - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @param ptr Pointer to the data - * @param exts dimensionality of the array (series of integers) - * @return raft::managed_mdspan - */ -template -auto make_managed_mdspan(ElementType* ptr, extents exts) -{ - return make_mdspan(ptr, exts); -} - /** * @brief Create a 0-dim (scalar) mdspan instance for device value. 
* @@ -229,7 +161,7 @@ auto make_managed_mdspan(ElementType* ptr, extents exts) * @param[in] ptr on device to wrap */ template -auto make_device_scalar_view(ElementType* ptr) +auto constexpr make_device_scalar_view(ElementType* ptr) { scalar_extent extents; return device_scalar_view{ptr, extents}; @@ -249,7 +181,7 @@ auto make_device_scalar_view(ElementType* ptr) template -auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +auto constexpr make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) { matrix_extent extents{n_rows, n_cols}; return device_matrix_view{ptr, extents}; @@ -269,10 +201,10 @@ auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_col * @param[in] stride leading dimension / stride of data */ template -auto make_device_strided_matrix_view(ElementType* ptr, - IndexType n_rows, - IndexType n_cols, - IndexType stride) +auto constexpr make_device_strided_matrix_view(ElementType* ptr, + IndexType n_rows, + IndexType n_cols, + IndexType stride) { constexpr auto is_row_major = std::is_same_v; IndexType stride0 = is_row_major ? (stride > 0 ? stride : n_cols) : 1; @@ -295,7 +227,7 @@ auto make_device_strided_matrix_view(ElementType* ptr, * @return raft::device_vector_view */ template -auto make_device_vector_view(ElementType* ptr, IndexType n) +auto constexpr make_device_vector_view(ElementType* ptr, IndexType n) { return device_vector_view{ptr, n}; } @@ -310,7 +242,7 @@ auto make_device_vector_view(ElementType* ptr, IndexType n) * @return raft::device_vector_view */ template -auto make_device_vector_view( +auto constexpr make_device_vector_view( ElementType* ptr, const typename LayoutPolicy::template mapping>& mapping) { diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp index 3b3538ea20..0192436934 100644 --- a/cpp/include/raft/core/host_container_policy.hpp +++ b/cpp/include/raft/core/host_container_policy.hpp @@ -6,7 +6,7 @@ */ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,4 +62,5 @@ class host_vector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; + } // namespace raft diff --git a/cpp/include/raft/core/host_device_accessor.hpp b/cpp/include/raft/core/host_device_accessor.hpp index e9ebdb6c9f..7cb2aaf487 100644 --- a/cpp/include/raft/core/host_device_accessor.hpp +++ b/cpp/include/raft/core/host_device_accessor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,6 +42,16 @@ struct host_device_accessor : public AccessorPolicy { using AccessorPolicy::AccessorPolicy; using offset_policy = host_device_accessor; host_device_accessor(AccessorPolicy const& that) : AccessorPolicy{that} {} // NOLINT + + // Prevent implicit conversion from incompatible host_device_accessor types + template + host_device_accessor(host_device_accessor const& that) = delete; + + template > + host_device_accessor(host_device_accessor const& that) + : AccessorPolicy{that} + { + } }; } // namespace raft diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index 9a675680ac..d5f431f4a2 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,7 +134,7 @@ using host_aligned_matrix_view = template > -auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +auto constexpr make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) { using data_handle_type = typename std::experimental::aligned_accessor -auto make_host_scalar_view(ElementType* ptr) +auto constexpr make_host_scalar_view(ElementType* ptr) { scalar_extent extents; return host_scalar_view{ptr, extents}; @@ -179,7 +179,7 @@ auto make_host_scalar_view(ElementType* ptr) template -auto make_host_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +auto constexpr make_host_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) { matrix_extent extents{n_rows, n_cols}; return host_matrix_view{ptr, extents}; @@ -196,7 +196,7 @@ auto make_host_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) template -auto make_host_vector_view(ElementType* ptr, IndexType n) +auto constexpr make_host_vector_view(ElementType* ptr, IndexType n) { return host_vector_view{ptr, n}; } diff --git a/cpp/include/raft/core/managed_container_policy.hpp b/cpp/include/raft/core/managed_container_policy.hpp new file mode 100644 index 0000000000..f4e26c6ef1 --- /dev/null +++ b/cpp/include/raft/core/managed_container_policy.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include + +#include // dynamic_extent +#include + +#include +#include +#include + +namespace raft { +/** + * @brief A container policy for managed mdarray. 
+ */ +template +class managed_uvector_policy { + public: + using element_type = ElementType; + using container_type = device_uvector; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = device_reference; + using const_reference = device_reference; + + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + auto create(raft::resources const& res, size_t n) -> container_type + { + return container_type(n, resource::get_cuda_stream(res), mr_); + } + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + + private: + static auto* get_default_memory_resource() + { + auto static result = rmm::mr::managed_memory_resource{}; + return &result; + } + rmm::mr::managed_memory_resource* mr_{get_default_memory_resource()}; +}; + +} // namespace raft +#else +#include +namespace raft { + +// Provide placeholders that will allow CPU-GPU interoperable codebases to +// compile in non-CUDA mode but which will throw exceptions at runtime on any +// attempt to touch device data + +template +using managed_uvector_policy = detail::fail_container_policy; + +} // namespace raft +#endif diff --git a/cpp/include/raft/core/managed_mdarray.hpp b/cpp/include/raft/core/managed_mdarray.hpp new file mode 100644 index 0000000000..c1438d941d --- /dev/null +++ b/cpp/include/raft/core/managed_mdarray.hpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief mdarray with managed container policy + * @tparam ElementType the data type of the elements + * @tparam Extents defines the shape + * @tparam LayoutPolicy policy for indexing strides and layout ordering + * @tparam ContainerPolicy storage and accessor policy + */ +template > +using managed_mdarray = + mdarray>; + +/** + * @brief Shorthand for 0-dim host mdarray (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using managed_scalar = managed_mdarray>; + +/** + * @brief Shorthand for 1-dim managed mdarray. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using managed_vector = managed_mdarray, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous managed matrix. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using managed_matrix = managed_mdarray, LayoutPolicy>; + +/** + * @brief Create a managed mdarray. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param handle raft::resources + * @param exts dimensionality of the array (series of integers) + * @return raft::managed_mdarray + */ +template +auto make_managed_mdarray(raft::resources const& handle, extents exts) +{ + using mdarray_t = managed_mdarray; + + typename mdarray_t::mapping_type layout{exts}; + typename mdarray_t::container_policy_type policy{}; + + return mdarray_t{handle, layout, policy}; +} + +/** + * @brief Create a 2-dim c-contiguous managed mdarray. + * + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] handle raft handle for managing expensive resources + * @param[in] n_rows number or rows in matrix + * @param[in] n_cols number of columns in matrix + * @return raft::managed_matrix + */ +template +auto make_managed_matrix(raft::resources const& handle, IndexType n_rows, IndexType n_cols) +{ + return make_managed_mdarray( + handle, make_extents(n_rows, n_cols)); +} + +/** + * @brief Create a managed scalar from v. + * + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] v scalar to wrap on managed + * @return raft::managed_scalar + */ +template +auto make_managed_scalar(raft::resources const& handle, ElementType const& v) +{ + scalar_extent extents; + using policy_t = typename managed_scalar::container_policy_type; + policy_t policy{}; + auto scalar = managed_scalar{handle, extents, policy}; + scalar(0) = v; + return scalar; +} + +/** + * @brief Create a 1-dim managed mdarray. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] handle raft handle for managing expensive cuda resources + * @param[in] n number of elements in vector + * @return raft::managed_vector + */ +template +auto make_managed_vector(raft::resources const& handle, IndexType n) +{ + return make_managed_mdarray(handle, + make_extents(n)); +} + +} // end namespace raft diff --git a/cpp/include/raft/core/managed_mdspan.hpp b/cpp/include/raft/core/managed_mdspan.hpp new file mode 100644 index 0000000000..9c2976ec6b --- /dev/null +++ b/cpp/include/raft/core/managed_mdspan.hpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { + +template +using managed_accessor = host_device_accessor; + +/** + * @brief std::experimental::mdspan with managed tag to indicate host/device accessibility + */ +template > +using managed_mdspan = mdspan>; + +template +struct is_managed_mdspan : std::false_type {}; +template +struct is_managed_mdspan + : std::bool_constant {}; + +/** + * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type + */ +template +using is_managed_mdspan_t = is_managed_mdspan>; + +template +using is_input_managed_mdspan_t = is_managed_mdspan>; + +template +using is_output_managed_mdspan_t = is_managed_mdspan>; + +/** + * @\brief Boolean to determine if variadic template types Tn are either raft::managed_mdspan or a + * derived type + */ +template +inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; + +template +inline constexpr bool is_input_managed_mdspan_v = + std::conjunction_v...>; + +template +inline constexpr bool is_output_managed_mdspan_v = + std::conjunction_v...>; + +template +using enable_if_managed_mdspan = std::enable_if_t>; + +template +using enable_if_input_managed_mdspan = std::enable_if_t>; + +template +using enable_if_output_managed_mdspan = std::enable_if_t>; + +/** + * @brief Shorthand for 0-dim managed mdspan (scalar). + * @tparam ElementType the data type of the scalar element + * @tparam IndexType the index type of the extents + */ +template +using managed_scalar_view = managed_mdspan>; + +/** + * @brief Shorthand for 1-dim managed mdspan. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using managed_vector_view = managed_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for c-contiguous managed matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + */ +template +using managed_matrix_view = managed_mdspan, LayoutPolicy>; + +/** + * @brief Shorthand for 128 byte aligned managed matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using managed_aligned_matrix_view = + managed_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for managed pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. 
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr to managed memory to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto constexpr make_managed_aligned_matrix_view(ElementType* ptr, + IndexType n_rows, + IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + static_assert(std::is_same>::value || + std::is_same>::value); + assert(reinterpret_cast(ptr) == + std::experimental::details::alignTo(reinterpret_cast(ptr), + detail::alignment::value)); + + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return managed_aligned_matrix_view{aligned_pointer, + extents}; +} + +/** + * @brief Create a 0-dim (scalar) mdspan instance for managed value. + * + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @param[in] ptr to managed memory to wrap + */ +template +auto constexpr make_managed_scalar_view(ElementType* ptr) +{ + scalar_extent extents; + return managed_scalar_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim c-contiguous mdspan instance for managed pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy policy for strides and layout ordering + * @tparam IndexType the index type of the extents + * @param[in] ptr to managed memory to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template +auto constexpr make_managed_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + matrix_extent extents{n_rows, n_cols}; + return managed_matrix_view{ptr, extents}; +} + +/** + * @brief Create a 2-dim mdspan instance for managed pointer with a strided layout + * that is restricted to stride 1 in the trailing dimension. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr to managed memory to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + * @param[in] stride leading dimension / stride of data + */ +template +auto constexpr make_managed_strided_matrix_view(ElementType* ptr, + IndexType n_rows, + IndexType n_cols, + IndexType stride) +{ + constexpr auto is_row_major = std::is_same_v; + IndexType stride0 = is_row_major ? (stride > 0 ? stride : n_cols) : 1; + IndexType stride1 = is_row_major ? 1 : (stride > 0 ? stride : n_rows); + + assert(is_row_major ? stride0 >= n_cols : stride1 >= n_rows); + matrix_extent extents{n_rows, n_cols}; + + auto layout = make_strided_layout(extents, std::array{stride0, stride1}); + return managed_matrix_view{ptr, layout}; +} + +/** + * @brief Create a 1-dim mdspan instance for managed pointer. 
+ * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr to managed memory to wrap + * @param[in] n number of elements in pointer + * @return raft::managed_vector_view + */ +template +auto constexpr make_managed_vector_view(ElementType* ptr, IndexType n) +{ + return managed_vector_view{ptr, n}; +} + +/** + * @brief Create a 1-dim mdspan instance for managed pointer. + * @tparam ElementType the data type of the vector elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr to managed memory to wrap + * @param[in] mapping The layout mapping to use for this vector + * @return raft::managed_vector_view + */ +template +auto constexpr make_managed_vector_view( + ElementType* ptr, + const typename LayoutPolicy::template mapping>& mapping) +{ + return managed_vector_view{ptr, mapping}; +} + +/** + * @brief Create a raft::managed_mdspan + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param ptr Pointer to the data + * @param exts dimensionality of the array (series of integers) + * @return raft::managed_mdspan + */ +template +auto constexpr make_managed_mdspan(ElementType* ptr, extents exts) +{ + return make_mdspan(ptr, exts); +} +} // end namespace raft diff --git a/cpp/include/raft/core/mdbuffer.cuh b/cpp/include/raft/core/mdbuffer.cuh new file mode 100644 index 0000000000..18533ce882 --- /dev/null +++ b/cpp/include/raft/core/mdbuffer.cuh @@ -0,0 +1,1020 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#else +#include +#endif + +namespace raft { + +/** + * @defgroup mdbuffer_apis multi-dimensional maybe-owning type + * @{ + */ + +/** + * @brief Retrieve a canonical index associated with a given memory type. + * + * For variants based on memory type, this index can be used to help keep a + * consistent ordering of the memory types in the variant. + */ +inline auto constexpr variant_index_from_memory_type(raft::memory_type mem_type) +{ + return static_cast>(mem_type); +} + +/** + * @brief Retrieve the memory type associated with a canonical index + */ +inline auto constexpr memory_type_from_variant_index( + std::underlying_type_t index) +{ + return static_cast(index); +} + +/** + * @brief Retrieve a type from a variant based on a given memory type. 
+ */ +template +using alternate_from_mem_type = + std::variant_alternative_t, + Variant>; + +namespace detail { +template +struct memory_type_to_default_policy {}; +template +struct memory_type_to_default_policy { + using type = typename raft::host_vector_policy; +}; +template +struct memory_type_to_default_policy { + using type = typename raft::device_uvector_policy; +}; +template +struct memory_type_to_default_policy { + using type = typename raft::managed_uvector_policy; +}; +template +struct memory_type_to_default_policy { + using type = typename raft::pinned_vector_policy; +}; + +template +using memory_type_to_default_policy_t = typename memory_type_to_default_policy::type; +} // namespace detail + +/** + * @brief A variant of container policies for each memory type which can be + * used to build the default container policy for a buffer. + */ +template +using default_container_policy_variant = + std::variant, + detail::memory_type_to_default_policy_t, + detail::memory_type_to_default_policy_t, + detail::memory_type_to_default_policy_t>; + +/** + * @brief A template used to translate a variant of underlying mdarray + * container policies into a container policy that can be used by an mdbuffer. + */ +template >> +struct default_buffer_container_policy { + using element_type = ElementType; + using value_type = std::remove_cv_t; + + private: + template + using raw_container_policy_at_index = std::variant_alternative_t; + + public: + using container_policy_variant = + std::variant, + static_cast(0)>, + host_device_accessor, + static_cast(1)>, + host_device_accessor, + static_cast(2)>, + host_device_accessor, + static_cast(3)>>; + template + using container_policy = alternate_from_mem_type; + using container_type_variant = + std::variant::container_type, + typename raw_container_policy_at_index<1>::container_type, + typename raw_container_policy_at_index<2>::container_type, + typename raw_container_policy_at_index<3>::container_type>; + + template + using container_type = alternate_from_mem_type; + + using accessor_policy_variant = + std::variant::accessor_policy, + static_cast(0)>, + host_device_accessor::accessor_policy, + static_cast(1)>, + host_device_accessor::accessor_policy, + static_cast(2)>, + host_device_accessor::accessor_policy, + static_cast(3)>>; + + template + using accessor_policy = alternate_from_mem_type; + + using const_accessor_policy_variant = std::variant< + host_device_accessor::const_accessor_policy, + static_cast(0)>, + host_device_accessor::const_accessor_policy, + static_cast(1)>, + host_device_accessor::const_accessor_policy, + static_cast(2)>, + host_device_accessor::const_accessor_policy, + static_cast(3)>>; + template + using const_accessor_policy = alternate_from_mem_type; + + template + auto create(raft::resources const& res, size_t n) + { + return container_type(res, n); + } + + auto create(raft::resources const& res, size_t n, raft::memory_type mem_type) + { + auto result = container_type_variant{}; + switch (mem_type) { + case raft::memory_type::host: result = create(res, n); break; + case raft::memory_type::device: result = create(res, n); break; + case raft::memory_type::managed: result = create(res, n); break; + case raft::memory_type::pinned: result = create(res, n); break; + } + return result; + } + + private: + template + auto static constexpr has_stream() -> decltype(std::declval().stream(), bool()) + { + return true; + }; + auto static constexpr has_stream(...) 
-> bool { return false; }; + + public: + template + [[nodiscard]] auto make_accessor_policy() noexcept + { + return accessor_policy{}; + } + template + [[nodiscard]] auto make_accessor_policy() const noexcept + { + return const_accessor_policy{}; + } + + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) noexcept + { + auto result = accessor_policy_variant{}; + switch (mem_type) { + case memory_type::host: result = make_accessor_policy(); break; + case memory_type::device: result = make_accessor_policy(); break; + case memory_type::managed: result = make_accessor_policy(); break; + case memory_type::pinned: result = make_accessor_policy(); break; + } + return result; + } + [[nodiscard]] auto make_accessor_policy(memory_type mem_type) const noexcept + { + auto result = const_accessor_policy_variant{}; + switch (mem_type) { + case memory_type::host: result = make_accessor_policy(); break; + case memory_type::device: result = make_accessor_policy(); break; + case memory_type::managed: result = make_accessor_policy(); break; + case memory_type::pinned: result = make_accessor_policy(); break; + } + return result; + } +}; + +/** + * @brief A type representing multi-dimensional data which may or may not own + * its underlying storage. `raft::mdbuffer` is used to conveniently perform + * copies of data _only_ when necessary to ensure that the data are accessible + * in the desired memory space and format. + * + * When developing functions that interact with the GPU, it is often necessary + * to ensure that the data are in a particular memory space (e.g. device, + * host, managed, pinned), but those functions may be called with data that + * may or may not already be in the desired memory space. For instance, when + * called in one workflow, the data may have been previously transferred to + * device, rendering a copy unnecessary. In another, the function may be + * directly invoked on host data. + * + * Even when working strictly with host memory, it is often necessary to + * ensure that the data are in a particular layout for efficient access (e.g. + * column major vs row major) or that the the data are of a particular type + * (e.g. double) even though we wish to call the function with data of + * another compatible type (e.g. float). + * + * `mdbuffer` is a tool for ensuring that the data are represented in exactly + * the desired format and location while flexibly supporting data which may + * not already be in that format or location. It does so by providing a + * non-owning view on data which are already in the required form, but it + * allocates (owned) memory and performs a copy if and only if it is + * necessary. + * + * Usage example: + * @code{.cpp} + * template + * void foo_device(raft::resources const& res, mdspan_type data) { + * auto buf = raft::mdbuffer{res, raft::mdbuffer{data}, raft::memory_type::device}; + * // Data in buf is now guaranteed to be accessible from device. + * // If it was already accessible from device, no copy was performed. If it + * // was not, a copy was performed. + * + * some_kernel<<<...>>>(buf.view()); + * + * // It is sometimes useful to know whether or not a copy was performed to + * // e.g. determine whether the transformed data should be copied back to its original + * // location. This can be checked via the `is_owning()` method. 
+ * if (buf.is_owning()) { + * raft::copy(res, data, buf.view()); + * } + * } + * @endcode + * + * Note that in this example, the `foo_device` template can be correctly + * instantiated for both host and device mdspans. Similarly we can use + * `mdbuffer` to coerce data to a particular memory layout and data-type, as in + * the following example: + * @code{.cpp} + * template + * void foo_device(raft::resources const& res, mdspan_type data) { + * auto buf = raft::mdbuffer, raft::row_major>{res, + * raft::mdbuffer{data}, raft::memory_type::device}; + * // Data in buf is now guaranteed to be accessible from device, and + * // represented by floats in row-major order. + * + * some_kernel<<<...>>>(buf.view()); + * + * // The same check can be used to determine whether or not a copy was + * // required, regardless of the cause. I.e. if the data were already on + * // device but in column-major order, the is_owning() method would still + * // return true because new storage needed to be allocated. + * if (buf.is_owning()) { + * raft::copy(res, data, buf.view()); + * } + * } + * @endcode + * + * Note that in this example, the `foo_device` template can accept data of + * any float-convertible type in any layout and of any memory type and coerce + * it to the desired device-accessible representation. + * + * Because `mdspan` types can be implicitly converted to `mdbuffer`, it is even + * possible to avoid multiple template instantiations by directly accepting an + * `mdbuffer` as argument, as in the following example: + * @code{.cpp} + * void foo_device(raft::resources const& res, raft::mdbuffer>&& + * data) { auto buf = raft::mdbuffer{res, data, raft::memory_type::device}; + * // Data in buf is now guaranteed to be accessible from device. + * + * some_kernel<<<...>>>(buf.view()); + * } + * @endcode + * + * In this example, `foo_device` can now accept any row-major mdspan of floats + * regardless of memory type without requiring separate template instantiations + * for each type. + * + * While the view method takes an optional compile-time memory type parameter, + * omitting this parameter will return a std::variant of mdspan types. This + * allows for straightforward runtime dispatching based on the memory type + * using std::visit, as in the following example: + * + * @code{.cpp} + * void foo(raft::resources const& res, raft::mdbuffer>&& data) { + * std::visit([](auto&& view) { + * // Do something with the view, including (possibly) dispatching based on + * // whether it is a host, device, managed, or pinned mdspan + * }, data.view()); + * } + * @endcode + * + * For convenience, runtime memory-type dispatching can also be performed + * without explicit use of `mdbuffer` using `raft::memory_type_dispatcher`, as + * described in @ref memory_type_dispatcher. Please see the full documentation + * of that function template for more extensive discussion of the many ways it + * can be used. 
To illustrate its connection to `mdbuffer`, however, consider + * the following example, which performs a similar task to the above + * `std::visit` call: + * + * @code{.cpp} + * void foo_device(raft::resources const& res, raft::device_matrix_view data) { + * // Implement foo solely for device data + * }; + * + * // Call foo with data of any memory type: + * template + * void foo(raft::resources const& res, mdspan_type data) { + * raft::memory_type_dispatcher(res, + * [&res](raft::device_matrix_view dev_data) {foo_device(res, dev_data);}, + * data + * ); + * } + * @endcode + * + * Here, the `memory_type_dispatcher` implicitly constructs an `mdbuffer` from + * the input and performs any necessary conversions before passing the input to + * `foo_device`. While `mdbuffer` does not require the use of + * `memory_type_dispatcher`, there are many common use cases in which explicit + * invocations of `mdbuffer` can be elided with `memory_type_dispatcher`. + * + * Finally, we should note that `mdbuffer` should almost never be passed as a + * const reference. To indicate const-ness of the underlying data, the + * `mdbuffer` should be constructed with a const memory type, but the mdbuffer + * itself should generally be passed as an rvalue reference in function + * arguments. Using an `mdbuffer` that is itself `const` is not strictly + * incorrect, but it indicates a likely misuse of the type. + * + * @tparam ElementType element type stored in the buffer + * @tparam Extents specifies the number of dimensions and their sizes + * @tparam LayoutPolicy specifies how data should be laid out in memory + * @tparam ContainerPolicy specifies how data should be allocated if necessary + * and how it should be accessed. This should very rarely need to be + * customized. For those cases where it must be customized, it is recommended + * to instantiate default_buffer_container_policy with a std::variant of + * container policies for each memory type. Note that the accessor policy of + * each container policy variant is used as the accessor policy for the mdspan + * view of the buffer for the corresponding memory type. + */ +template > +struct mdbuffer { + using extents_type = Extents; + using layout_type = LayoutPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + + using value_type = std::remove_cv_t; + using index_type = typename extents_type::index_type; + using difference_type = std::ptrdiff_t; + using rank_type = typename extents_type::rank_type; + + using container_policy_type = ContainerPolicy; + using accessor_policy_variant = typename ContainerPolicy::accessor_policy_variant; + + template + using accessor_policy = alternate_from_mem_type; + + using container_type_variant = typename container_policy_type::container_type_variant; + + template + using container_type = typename container_policy_type::template container_type; + + template + using owning_type = mdarray>; + // We use the static cast here to ensure that the memory types appear in the + // order expected for retrieving the correct variant alternative based on + // memory type. Even if the memory types are re-arranged in the enum and + // assigned different values, the logic should remain correct. 
+ using owning_type_variant = std::variant(0)>, + owning_type(1)>, + owning_type(2)>, + owning_type(3)>>; + + template + using view_type = std::conditional_t, + typename owning_type::const_view_type, + typename owning_type::view_type>; + + using view_type_variant = std::variant(0)>, + view_type(1)>, + view_type(2)>, + view_type(3)>>; + + template + using const_view_type = typename owning_type::const_view_type; + using const_view_type_variant = std::variant(0)>, + const_view_type(1)>, + const_view_type(2)>, + const_view_type(3)>>; + + using storage_type_variant = concatenated_variant_t; + + // Non-owning types are stored first in the variant Thus, if we want to access the + // owning type corresponding to device memory, we would need to skip over the + // non-owning types and then go to the index which corresponds to the memory + // type: is_owning * num_non_owning_types + index = 1 * 4 + 1 = 5 + template + using storage_type = + std::variant_alternative_t + + std::size_t{variant_index_from_memory_type(MemType)}, + storage_type_variant>; + + /** + * @brief Construct an empty, uninitialized buffer + */ + constexpr mdbuffer() = default; + + private: + container_policy_type cp_{}; + storage_type_variant data_{}; + + // This template is used to determine whether or not is possible to copy from + // the mdspan returned by the view method of a FromT type mdbuffer with + // memory type indicated by FromIndex to the mdspan returned by this mdbuffer + // at ToIndex + template + auto static constexpr is_copyable_combination() + { + return detail::mdspan_copyable_v< + decltype(std::declval>().view()), + std::variant_alternative_t().view())>>; + } + + // Using an index_sequence to iterate over the possible memory types of this + // mdbuffer, we construct an array of bools to determine whether or not the + // mdspan returned by the view method of a FromT type mdbuffer with memory + // type indicated by FromIndex can be copied to the mdspan returned by this + // mdbuffer's view method at each memory type + template + auto static constexpr get_to_copyable_combinations(std::index_sequence) + { + return std::array{is_copyable_combination()...}; + } + + // Using an index_sequence to iterate over the possible memory types of the + // FromT type mdbuffer, we construct an array of arrays indicating whether it + // is possible to copy from any mdspan that can be returned from the FromT + // mdbuffer to any mdspan that can be returned from this mdbuffer + template + auto static constexpr get_from_copyable_combinations(std::index_sequence) + { + return std::array{get_to_copyable_combinations( + std::make_index_sequence>())...}; + } + + // Get an array of arrays indicating whether or not it is possible to copy + // from any given memory type of a FromT mdbuffer to any memory type of this + // mdbuffer + template + auto static constexpr get_copyable_combinations() + { + return get_from_copyable_combinations( + std::make_index_sequence().view())>>()); + } + + template + auto static constexpr is_copyable_from(std::index_sequence) + { + return (... || get_copyable_combinations()[FromIndex][Is]); + } + + template + auto static constexpr is_copyable_from(bool, std::index_sequence) + { + return (... 
|| is_copyable_from( + std::make_index_sequence>())); + } + + template + auto static constexpr is_copyable_from() + { + return is_copyable_from( + true, + std::make_index_sequence().view())>>()); + } + + template + auto static is_copyable_from(FromT&& other, memory_type mem_type) + { + auto static copyable_combinations = get_copyable_combinations(); + return copyable_combinations[variant_index_from_memory_type(other.mem_type())] + [variant_index_from_memory_type(mem_type)]; + } + + template + auto static copy_from(raft::resources const& res, FromT&& other, memory_type mem_type) + { + auto result = storage_type_variant{}; + switch (mem_type) { + case memory_type::host: { + result = std::visit( + [&res](auto&& other_view) { + auto tmp_result = owning_type{ + res, + mapping_type{other_view.extents()}, + typename container_policy_type::template container_policy{}}; + raft::copy(res, tmp_result.view(), other_view); + return tmp_result; + }, + other.view()); + break; + } + case memory_type::device: { + result = std::visit( + [&res](auto&& other_view) { + auto tmp_result = owning_type{ + res, + mapping_type{other_view.extents()}, + typename container_policy_type::template container_policy{}}; + raft::copy(res, tmp_result.view(), other_view); + return tmp_result; + }, + other.view()); + break; + } + case memory_type::managed: { + result = std::visit( + [&res](auto&& other_view) { + auto tmp_result = owning_type{ + res, + mapping_type{other_view.extents()}, + typename container_policy_type::template container_policy{}}; + raft::copy(res, tmp_result.view(), other_view); + return tmp_result; + }, + other.view()); + break; + } + case memory_type::pinned: { + result = std::visit( + [&res](auto&& other_view) { + auto tmp_result = owning_type{ + res, + mapping_type{other_view.extents()}, + typename container_policy_type::template container_policy{}}; + raft::copy(res, tmp_result.view(), other_view); + return tmp_result; + }, + other.view()); + break; + } + } + return result; + } + + public: + /** + * @brief Construct an mdbuffer wrapping an existing mdspan. The resulting + * mdbuffer will be non-owning and match the memory type, layout, and + * element type of the mdspan. + */ + template < + typename OtherAccessorPolicy, + std::enable_if_t>* = nullptr> + mdbuffer(mdspan other) : data_{other} + { + } + + /** + * @brief Construct an mdbuffer of const elements wrapping an existing mdspan + * with non-const elements. The resulting mdbuffer will be non-owning and match the memory type, + * layout, and element type of the mdspan. + */ + template < + typename OtherElementType, + typename OtherAccessorPolicy, + std::enable_if_t && + std::is_same_v && + is_type_in_variant_v>* = nullptr> + mdbuffer(mdspan other) + : data_{raft::make_const_mdspan(other)} + { + } + + /** + * @brief Construct an mdbuffer to hold an existing mdarray rvalue. The + * mdarray will be moved into the mdbuffer, and the mdbuffer will be owning. + */ + template , + typename container_policy_type::container_policy_variant>>* = nullptr> + mdbuffer(mdarray&& other) + : data_{std::move(other)} + { + } + + /** + * @brief Construct an mdbuffer from an existing mdarray lvalue. 
An mdspan + * view will be taken from the mdarray in order to construct the mdbuffer, + * and the mdbuffer will be non-owning + */ + template , + typename container_policy_type::container_policy_variant>>* = nullptr> + mdbuffer(mdarray& other) + : mdbuffer{other.view()} + { + } + + /** + * @brief Construct one mdbuffer from another mdbuffer rvalue with matching + * element type, extents, layout, and container policy. + * + * If the existing mdbuffer is owning and of the correct memory type, + * the new mdbuffer will take ownership of the underlying memory + * (preventing a view on memory owned by a moved-from object). The memory + * type of the new mdbuffer may be specified explicitly, in which case a copy + * will be performed if and only if it is necessary to do so. + */ + mdbuffer(raft::resources const& res, + mdbuffer&& other, + std::optional specified_mem_type = std::nullopt) + : data_{[&res, &other, specified_mem_type, this]() { + auto other_mem_type = other.mem_type(); + auto mem_type = specified_mem_type.value_or(other_mem_type); + auto result = storage_type_variant{}; + if (mem_type == other.mem_type()) { + result = std::move(other.data_); + } else if (!other.is_owning() && has_compatible_accessibility(other_mem_type, mem_type) && + !is_host_device_accessible(mem_type)) { + switch (mem_type) { + case (memory_type::host): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::device): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::managed): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::pinned): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + } + } else { + result = copy_from(res, other, mem_type); + } + return result; + }()} + { + } + + /** + * @brief Construct one mdbuffer from another mdbuffer lvalue with matching + * element type, extents, layout, and container policy. + * + * Unlike when constructing from an rvalue, the new mdbuffer will take a + * non-owning view whenever possible, since it is assumed that the caller + * will manage the lifetime of the lvalue input. Note that the mdbuffer + * passed here must itself be non-const in order to allow this constructor to + * provide an equivalent view of the underlying data. To indicate const-ness + * of the underlying data, mdbuffers should be constructed with a const + * ElementType. 
+ */ + mdbuffer(raft::resources const& res, + mdbuffer& other, /* NOLINT */ + std::optional specified_mem_type = std::nullopt) + : data_{[&res, &other, specified_mem_type, this]() { + auto mem_type = specified_mem_type.value_or(other.mem_type()); + auto result = storage_type_variant{}; + auto other_mem_type = other.mem_type(); + if (mem_type == other_mem_type) { + std::visit([&result](auto&& other_view) { result = other_view; }, other.view()); + } else if (has_compatible_accessibility(other_mem_type, mem_type) && + !is_host_device_accessible(mem_type)) { + switch (mem_type) { + case (memory_type::host): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::device): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::managed): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + case (memory_type::pinned): { + result = std::visit( + [&result, this](auto&& other_view) { + return view_type{ + other_view.data_handle(), + other_view.mapping(), + cp_.template make_accessor_policy()}; + }, + other.view()); + break; + } + } + } else { + result = copy_from(res, other, mem_type); + } + return result; + }()} + { + } + + /** + * @brief Construct an mdbuffer from an existing mdbuffer with arbitrary but + * compatible element type, extents, layout, and container policy. This + * constructor is used to coerce data to specific element types, layouts, + * or extents as well as specifying a memory type. + */ + template < + typename OtherElementType, + typename OtherExtents, + typename OtherLayoutPolicy, + typename OtherContainerPolicy, + std::enable_if_t>()>* = + nullptr> + mdbuffer( + raft::resources const& res, + mdbuffer const& other, + std::optional specified_mem_type = std::nullopt) + : data_{[&res, &other, specified_mem_type]() { + auto mem_type = specified_mem_type.value_or(other.mem_type()); + // Note: We perform this check at runtime because it is possible for two + // mdbuffers to have storage types which may be copied to each other for + // some memory types but not for others. This is an unusual situation, but + // we still need to guard against it. 
+ RAFT_EXPECTS( + is_copyable_from(other, mem_type), + "mdbuffer cannot be constructed from other mdbuffer with indicated memory type"); + return copy_from(res, other, mem_type); + }()} + { + } + + /** + * @brief Return the memory type of the underlying data referenced by the + * mdbuffer + */ + [[nodiscard]] auto constexpr mem_type() const + { + return static_cast(data_.index() % std::variant_size_v); + }; + + /** + * @brief Return a boolean indicating whether or not the mdbuffer owns its + * storage + */ + [[nodiscard]] auto constexpr is_owning() const + { + return data_.index() >= std::variant_size_v; + }; + + private: + template + [[nodiscard]] auto view() + { + if constexpr (MemTypeConstant::value.has_value()) { + if (is_owning()) { + if constexpr (std::is_const_v) { + return std::as_const(std::get>(data_)).view(); + } else { + return std::get>(data_).view(); + } + } else { + return std::get>(data_); + } + } else { + return std::visit( + [](auto&& inner) { + if constexpr (is_mdspan_v>) { + return view_type_variant{inner}; + } else { + if constexpr (std::is_const_v) { + return view_type_variant{std::as_const(inner).view()}; + } else { + return view_type_variant{inner.view()}; + } + } + }, + data_); + } + } + + template + [[nodiscard]] auto view() const + { + if constexpr (MemTypeConstant::value.has_value()) { + if (is_owning()) { + return make_const_mdspan( + std::get>(data_).view()); + } else { + return make_const_mdspan(std::get>(data_)); + } + } else { + return std::visit( + [](auto&& inner) { + if constexpr (is_mdspan_v>) { + return const_view_type_variant{make_const_mdspan(inner)}; + } else { + return const_view_type_variant{make_const_mdspan(inner.view())}; + } + }, + data_); + } + } + + public: + /** + * @brief Return an mdspan of the indicated memory type representing a view + * on the stored data. If the mdbuffer does not contain data of the indicated + * memory type, a std::bad_variant_access will be thrown. + */ + template + [[nodiscard]] auto view() + { + return view>(); + } + /** + * @brief Return an mdspan containing const elements of the indicated memory type representing a + * view on the stored data. If the mdbuffer does not contain data of the indicated memory type, a + * std::bad_variant_access will be thrown. + */ + template + [[nodiscard]] auto view() const + { + return view>(); + } + /** + * @brief Return a std::variant representing the possible mdspan types that + * could be returned as views on the mdbuffer. The variant will contain the mdspan + * corresponding to its current memory type. + * + * This method is useful for writing generic code to handle any memory type + * that might be contained in an mdbuffer at a particular point in a + * workflow. By performing a `std::visit` on the returned value, the caller + * can easily dispatch to the correct code path for the memory type. + */ + [[nodiscard]] auto view() { return view>(); } + /** + * @brief Return a std::variant representing the possible mdspan types that + * could be returned as const views on the mdbuffer. The variant will contain the mdspan + * corresponding to its current memory type. + * + * This method is useful for writing generic code to handle any memory type + * that might be contained in an mdbuffer at a particular point in a + * workflow. By performing a `std::visit` on the returned value, the caller + * can easily dispatch to the correct code path for the memory type. 
+
+/**
+ * @brief Template checks and helpers to determine if type T is an mdbuffer
+ * or a derived type
+ */
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename ContainerPolicy>
+void __takes_an_mdbuffer_ptr(mdbuffer<ElementType, Extents, LayoutPolicy, ContainerPolicy>*);
+
+template <typename T, typename = void>
+struct is_mdbuffer : std::false_type {};
+template <typename T>
+struct is_mdbuffer<T, std::void_t<decltype(__takes_an_mdbuffer_ptr(std::declval<T*>()))>>
+  : std::true_type {};
+
+template <typename T, typename = void>
+struct is_input_mdbuffer : std::false_type {};
+template <typename T>
+struct is_input_mdbuffer<T, std::void_t<decltype(__takes_an_mdbuffer_ptr(std::declval<T*>()))>>
+  : std::bool_constant<std::is_const_v<typename T::element_type>> {};
+
+template <typename T, typename = void>
+struct is_output_mdbuffer : std::false_type {};
+template <typename T>
+struct is_output_mdbuffer<T, std::void_t<decltype(__takes_an_mdbuffer_ptr(std::declval<T*>()))>>
+  : std::bool_constant<!std::is_const_v<typename T::element_type>> {};
+
+template <typename T>
+using is_mdbuffer_t = is_mdbuffer<std::remove_const_t<T>>;
+
+template <typename T>
+using is_input_mdbuffer_t = is_input_mdbuffer<T>;
+
+template <typename T>
+using is_output_mdbuffer_t = is_output_mdbuffer<T>;
+
+/**
+ * @brief Boolean to determine if variadic template types Tn are
+ * raft::mdbuffer or derived types
+ */
+template <typename... Tn>
+inline constexpr bool is_mdbuffer_v = std::conjunction_v<is_mdbuffer_t<Tn>...>;
+
+template <typename... Tn>
+using enable_if_mdbuffer = std::enable_if_t<is_mdbuffer_v<Tn...>>;
+
+template <typename... Tn>
+inline constexpr bool is_input_mdbuffer_v = std::conjunction_v<is_input_mdbuffer_t<Tn>...>;
+
+template <typename... Tn>
+using enable_if_input_mdbuffer = std::enable_if_t<is_input_mdbuffer_v<Tn...>>;
+
+template <typename... Tn>
+inline constexpr bool is_output_mdbuffer_v = std::conjunction_v<is_output_mdbuffer_t<Tn>...>;
+
+template <typename... Tn>
+using enable_if_output_mdbuffer = std::enable_if_t<is_output_mdbuffer_v<Tn...>>;
+
+/** @} */
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/mdbuffer.hpp b/cpp/include/raft/core/mdbuffer.hpp
new file mode 100644
index 0000000000..8281b5c6d6
--- /dev/null
+++ b/cpp/include/raft/core/mdbuffer.hpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef RAFT_DISABLE_CUDA
+#pragma message(__FILE__                                             \
+                " should only be used in CUDA-disabled RAFT builds." \
+                " Please use equivalent .cuh header instead.")
+#else
+// It is safe to include this cuh file in an hpp header because all CUDA code
+// is ifdef'd out for CUDA-disabled builds.
+#include <raft/core/mdbuffer.cuh>
+#endif
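
As a brief illustration of the traits above, a function template can be constrained to mdbuffer arguments via `enable_if_mdbuffer`; a minimal sketch (the function name is illustrative):

    #include <raft/core/mdbuffer.cuh>

    // Participates in overload resolution only when BufT is a raft::mdbuffer
    // or a type derived from one.
    template <typename BufT, raft::enable_if_mdbuffer<BufT>* = nullptr>
    auto mem_type_of(BufT const& buf)
    {
      return buf.mem_type();
    }
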
diff --git a/cpp/include/raft/core/memory_type.hpp b/cpp/include/raft/core/memory_type.hpp
index cd37a0ee50..7849cd67ab 100644
--- a/cpp/include/raft/core/memory_type.hpp
+++ b/cpp/include/raft/core/memory_type.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,13 +14,28 @@
 * limitations under the License.
 */
 #pragma once
+#include <cstdint>
+#include <optional>
+#ifndef RAFT_DISABLE_CUDA
+#include <raft/core/error.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <cuda_runtime_api.h>
+#else
+#include <raft/core/logger.hpp>
+#endif
 
 namespace raft {
-enum class memory_type { host, device, managed, pinned };
+enum class memory_type : std::uint8_t {
+  host    = std::uint8_t{0},
+  pinned  = std::uint8_t{1},
+  device  = std::uint8_t{2},
+  managed = std::uint8_t{3}
+};
 
 auto constexpr is_device_accessible(memory_type mem_type)
 {
-  return (mem_type == memory_type::device || mem_type == memory_type::managed);
+  return (mem_type == memory_type::device || mem_type == memory_type::managed ||
+          mem_type == memory_type::pinned);
 }
 auto constexpr is_host_accessible(memory_type mem_type)
 {
@@ -32,6 +47,22 @@ auto constexpr is_host_device_accessible(memory_type mem_type)
   return is_device_accessible(mem_type) && is_host_accessible(mem_type);
 }
 
+auto constexpr has_compatible_accessibility(memory_type old_mem_type, memory_type new_mem_type)
+{
+  return ((!is_device_accessible(new_mem_type) || is_device_accessible(old_mem_type)) &&
+          (!is_host_accessible(new_mem_type) || is_host_accessible(old_mem_type)));
+}
+
+template <memory_type... mem_types>
+struct memory_type_constant {
+  static_assert(sizeof...(mem_types) < 2, "At most one memory type can be specified");
+  auto static constexpr value = []() {
+    auto result = std::optional<memory_type>{};
+    if constexpr (sizeof...(mem_types) == 1) { result = std::make_optional(mem_types...); }
+    return result;
+  }();
+};
+
 namespace detail {
 
 template <bool is_host_accessible, bool is_device_accessible>
@@ -49,4 +80,23 @@ auto constexpr memory_type_from_access()
 }
 }  // end namespace detail
+
+template <typename T>
+auto memory_type_from_pointer(T* ptr)
+{
+  auto result = memory_type::host;
+#ifndef RAFT_DISABLE_CUDA
+  auto attrs = cudaPointerAttributes{};
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attrs, ptr));
+  switch (attrs.type) {
+    case cudaMemoryTypeDevice: result = memory_type::device; break;
+    case cudaMemoryTypeHost: result = memory_type::host; break;
+    case cudaMemoryTypeManaged: result = memory_type::managed; break;
+    default: result = memory_type::host;
+  }
+#else
+  RAFT_LOG_DEBUG("RAFT compiled without CUDA support, assuming pointer is host pointer");
+#endif
+  return result;
+}
 }  // end namespace raft
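
The predicates above change observable semantics: pinned memory now counts as both host- and device-accessible. A few compile-time checks, mirroring the updated tests later in this patch, illustrate the new behavior:

    #include <raft/core/memory_type.hpp>

    static_assert(raft::is_host_accessible(raft::memory_type::pinned));
    static_assert(raft::is_device_accessible(raft::memory_type::pinned));
    static_assert(raft::is_host_device_accessible(raft::memory_type::pinned));
    // Narrowing accessibility is compatible; widening it is not:
    static_assert(raft::has_compatible_accessibility(raft::memory_type::pinned,
                                                     raft::memory_type::host));
    static_assert(!raft::has_compatible_accessibility(raft::memory_type::host,
                                                      raft::memory_type::device));

Note also that `memory_type_from_pointer` classifies `cudaMallocHost` allocations as `memory_type::host`, since `cudaPointerGetAttributes` reports such pointers as `cudaMemoryTypeHost`; the `MemoryTypeFromPointer` tests near the end of this patch rely on exactly that behavior.
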
diff --git a/cpp/include/raft/core/pinned_container_policy.hpp b/cpp/include/raft/core/pinned_container_policy.hpp
new file mode 100644
index 0000000000..51451deadb
--- /dev/null
+++ b/cpp/include/raft/core/pinned_container_policy.hpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <cstddef>
+#include <raft/core/resources.hpp>
+#include <type_traits>
+#ifndef RAFT_DISABLE_CUDA
+#include <thrust/host_vector.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/system/cuda/memory_resource.h>
+#else
+#include <raft/core/detail/fail_container_policy.hpp>
+#endif
+
+namespace raft {
+#ifndef RAFT_DISABLE_CUDA
+
+/**
+ * @brief A thin wrapper over thrust::host_vector for implementing the pinned mdarray container
+ * policy.
+ */
+template <typename T>
+struct pinned_container {
+  using value_type = T;
+  using allocator_type =
+    thrust::mr::stateless_resource_allocator<T,
+                                             thrust::system::cuda::universal_host_pinned_memory_resource>;
+
+ private:
+  using underlying_container_type = thrust::host_vector<T, allocator_type>;
+  underlying_container_type data_;
+
+ public:
+  using size_type = std::size_t;
+
+  using reference       = value_type&;
+  using const_reference = value_type const&;
+
+  using pointer       = value_type*;
+  using const_pointer = value_type const*;
+
+  using iterator       = pointer;
+  using const_iterator = const_pointer;
+
+  ~pinned_container()                           = default;
+  pinned_container(pinned_container&&) noexcept = default;
+  pinned_container(pinned_container const& that) : data_{that.data_} {}
+
+  auto operator=(pinned_container const& that) -> pinned_container&
+  {
+    data_ = underlying_container_type{that.data_};
+    return *this;
+  }
+  auto operator=(pinned_container&& that) noexcept -> pinned_container& = default;
+
+  /**
+   * @brief Ctor that accepts a size.
+   */
+  explicit pinned_container(std::size_t size, allocator_type const& alloc) : data_{size, alloc} {}
+  /**
+   * @brief Index operator that returns a reference to the actual data.
+   */
+  template <typename Index>
+  auto operator[](Index i) noexcept -> reference
+  {
+    return data_[i];
+  }
+  /**
+   * @brief Index operator that returns a reference to the actual data.
+   */
+  template <typename Index>
+  auto operator[](Index i) const noexcept
+  {
+    return data_[i];
+  }
+
+  void resize(size_type size) { data_.resize(size); }
+
+  [[nodiscard]] auto data() noexcept -> pointer { return data_.data().get(); }
+  [[nodiscard]] auto data() const noexcept -> const_pointer { return data_.data().get(); }
+};
+
+/**
+ * @brief A container policy for pinned mdarray.
+ */
+template <typename ElementType>
+struct pinned_vector_policy {
+  using element_type          = ElementType;
+  using container_type        = pinned_container<element_type>;
+  using allocator_type        = typename container_type::allocator_type;
+  using pointer               = typename container_type::pointer;
+  using const_pointer         = typename container_type::const_pointer;
+  using reference             = typename container_type::reference;
+  using const_reference       = typename container_type::const_reference;
+  using accessor_policy       = std::experimental::default_accessor<element_type>;
+  using const_accessor_policy = std::experimental::default_accessor<element_type const>;
+
+  auto create(raft::resources const&, size_t n) -> container_type
+  {
+    return container_type(n, allocator_);
+  }
+
+  constexpr pinned_vector_policy() noexcept(
+    std::is_nothrow_default_constructible_v<allocator_type>)
+    : allocator_{}
+  {
+  }
+
+  [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
+  {
+    return c[n];
+  }
+  [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept
+    -> const_reference
+  {
+    return c[n];
+  }
+
+  [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; }
+  [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
+
+ private:
+  allocator_type allocator_;
+};
+#else
+template <typename ElementType>
+using pinned_vector_policy = detail::fail_container_policy<ElementType>;
+#endif
+}  // namespace raft
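
The convenience aliases in the next file simply plug this policy into `raft::mdarray`; spelled out by hand, and assuming a CUDA-enabled build, the equivalence looks like this:

    #include <cstdint>
    #include <raft/core/mdarray.hpp>
    #include <raft/core/pinned_container_policy.hpp>

    // A 1-d pinned mdarray of floats, written without the pinned_mdarray alias.
    using pinned_float_vector =
      raft::mdarray<float,
                    raft::vector_extent<std::uint32_t>,
                    raft::layout_c_contiguous,
                    raft::pinned_vector_policy<float>>;
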
diff --git a/cpp/include/raft/core/pinned_mdarray.hpp b/cpp/include/raft/core/pinned_mdarray.hpp
new file mode 100644
index 0000000000..72b8d52e0d
--- /dev/null
+++ b/cpp/include/raft/core/pinned_mdarray.hpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <raft/core/mdarray.hpp>
+#include <raft/core/memory_type.hpp>
+#include <raft/core/pinned_container_policy.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft {
+
+/**
+ * @brief mdarray with pinned container policy
+ * @tparam ElementType the data type of the elements
+ * @tparam Extents defines the shape
+ * @tparam LayoutPolicy policy for indexing strides and layout ordering
+ * @tparam ContainerPolicy storage and accessor policy
+ */
+template <typename ElementType,
+          typename Extents,
+          typename LayoutPolicy                        = layout_c_contiguous,
+          template <typename> typename ContainerPolicy = pinned_vector_policy>
+using pinned_mdarray =
+  mdarray<ElementType, Extents, LayoutPolicy, ContainerPolicy<ElementType>>;
+
+/**
+ * @brief Shorthand for 0-dim pinned mdarray (scalar).
+ * @tparam ElementType the data type of the scalar element
+ * @tparam IndexType the index type of the extents
+ */
+template <typename ElementType, typename IndexType = std::uint32_t>
+using pinned_scalar = pinned_mdarray<ElementType, scalar_extent<IndexType>>;
+
+/**
+ * @brief Shorthand for 1-dim pinned mdarray.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+using pinned_vector = pinned_mdarray<ElementType, vector_extent<IndexType>, LayoutPolicy>;
+
+/**
+ * @brief Shorthand for c-contiguous pinned matrix.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+using pinned_matrix = pinned_mdarray<ElementType, matrix_extent<IndexType>, LayoutPolicy>;
+
+/**
+ * @brief Create a pinned mdarray.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param handle raft::resources
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::pinned_mdarray
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous,
+          size_t... Extents>
+auto make_pinned_mdarray(raft::resources const& handle, extents<IndexType, Extents...> exts)
+{
+  using mdarray_t = pinned_mdarray<ElementType, decltype(exts), LayoutPolicy>;
+
+  typename mdarray_t::mapping_type layout{exts};
+  typename mdarray_t::container_policy_type policy{};
+
+  return mdarray_t{handle, layout, policy};
+}
+
+/**
+ * @brief Create a 2-dim c-contiguous pinned mdarray.
+ *
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] handle raft handle for managing expensive resources
+ * @param[in] n_rows number of rows in matrix
+ * @param[in] n_cols number of columns in matrix
+ * @return raft::pinned_matrix
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto make_pinned_matrix(raft::resources const& handle, IndexType n_rows, IndexType n_cols)
+{
+  return make_pinned_mdarray<ElementType, IndexType, LayoutPolicy>(
+    handle, make_extents<IndexType>(n_rows, n_cols));
+}
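
A short usage sketch for the factories above (assumes a CUDA-enabled build and a live device):

    #include <cstdint>
    #include <raft/core/device_resources.hpp>
    #include <raft/core/pinned_mdarray.hpp>

    void pinned_matrix_example()
    {
      auto res = raft::device_resources{};
      // 4 x 8 matrix in pinned (page-locked) host memory: writable from the
      // host and directly accessible from device code.
      auto mat  = raft::make_pinned_matrix<float, std::uint32_t>(res, 4, 8);
      mat(0, 0) = 1.0f;
    }
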
+
+/**
+ * @brief Create a pinned scalar from v.
+ *
+ * @tparam ElementType the data type of the scalar element
+ * @tparam IndexType the index type of the extents
+ * @param[in] handle raft handle for managing expensive cuda resources
+ * @param[in] v scalar to wrap in pinned memory
+ * @return raft::pinned_scalar
+ */
+template <typename ElementType, typename IndexType = std::uint32_t>
+auto make_pinned_scalar(raft::resources const& handle, ElementType const& v)
+{
+  scalar_extent<IndexType> extents;
+  using policy_t = typename pinned_scalar<ElementType, IndexType>::container_policy_type;
+  policy_t policy{};
+  auto scalar = pinned_scalar<ElementType, IndexType>{handle, extents, policy};
+  scalar(0)   = v;
+  return scalar;
+}
+
+/**
+ * @brief Create a 1-dim pinned mdarray.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] handle raft handle for managing expensive cuda resources
+ * @param[in] n number of elements in vector
+ * @return raft::pinned_vector
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto make_pinned_vector(raft::resources const& handle, IndexType n)
+{
+  return make_pinned_mdarray<ElementType, IndexType, LayoutPolicy>(handle,
+                                                                   make_extents<IndexType>(n));
+}
+
+}  // end namespace raft
diff --git a/cpp/include/raft/core/pinned_mdspan.hpp b/cpp/include/raft/core/pinned_mdspan.hpp
new file mode 100644
index 0000000000..e764101d1c
--- /dev/null
+++ b/cpp/include/raft/core/pinned_mdspan.hpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <raft/core/host_device_accessor.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/memory_type.hpp>
+
+namespace raft {
+
+template <typename AccessorPolicy>
+using pinned_accessor = host_device_accessor<AccessorPolicy, memory_type::pinned>;
+
+/**
+ * @brief std::experimental::mdspan with pinned tag to indicate host/device accessibility
+ */
+template <typename ElementType,
+          typename Extents,
+          typename LayoutPolicy   = layout_c_contiguous,
+          typename AccessorPolicy = std::experimental::default_accessor<ElementType>>
+using pinned_mdspan = mdspan<ElementType, Extents, LayoutPolicy, pinned_accessor<AccessorPolicy>>;
+
+template <typename T, bool B = is_mdspan_v<T>>
+struct is_pinned_mdspan : std::false_type {};
+template <typename T>
+struct is_pinned_mdspan<T, true>
+  : std::bool_constant<T::accessor_type::mem_type == memory_type::pinned> {};
+
+/**
+ * @brief Boolean to determine if template type T is either raft::pinned_mdspan or a derived type
+ */
+template <typename T>
+using is_pinned_mdspan_t = is_pinned_mdspan<std::remove_const_t<T>>;
+
+template <typename T>
+using is_input_pinned_mdspan_t = is_pinned_mdspan<T, is_input_mdspan_v<T>>;
+
+template <typename T>
+using is_output_pinned_mdspan_t = is_pinned_mdspan<T, is_output_mdspan_v<T>>;
+
+/**
+ * @brief Boolean to determine if variadic template types Tn are either raft::pinned_mdspan or a
+ * derived type
+ */
+template <typename... Tn>
+inline constexpr bool is_pinned_mdspan_v = std::conjunction_v<is_pinned_mdspan_t<Tn>...>;
+
+template <typename... Tn>
+inline constexpr bool is_input_pinned_mdspan_v =
+  std::conjunction_v<is_input_pinned_mdspan_t<Tn>...>;
+
+template <typename... Tn>
+inline constexpr bool is_output_pinned_mdspan_v =
+  std::conjunction_v<is_output_pinned_mdspan_t<Tn>...>;
+
+template <typename... Tn>
+using enable_if_pinned_mdspan = std::enable_if_t<is_pinned_mdspan_v<Tn...>>;
+
+template <typename... Tn>
+using enable_if_input_pinned_mdspan = std::enable_if_t<is_input_pinned_mdspan_v<Tn...>>;
+
+template <typename... Tn>
+using enable_if_output_pinned_mdspan = std::enable_if_t<is_output_pinned_mdspan_v<Tn...>>;
+
+/**
+ * @brief Shorthand for 0-dim pinned mdspan (scalar).
+ * @tparam ElementType the data type of the scalar element
+ * @tparam IndexType the index type of the extents
+ */
+template <typename ElementType, typename IndexType = std::uint32_t>
+using pinned_scalar_view = pinned_mdspan<ElementType, scalar_extent<IndexType>>;
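
A compile-time sketch of the detection traits above (types only, nothing is allocated):

    #include <raft/core/host_mdspan.hpp>
    #include <raft/core/pinned_mdspan.hpp>

    static_assert(raft::is_pinned_mdspan_v<raft::pinned_scalar_view<float>>);
    // A plain host mdspan does not satisfy the pinned trait:
    static_assert(!raft::is_pinned_mdspan_v<raft::host_scalar_view<float>>);
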
+
+/**
+ * @brief Shorthand for 1-dim pinned mdspan.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+using pinned_vector_view = pinned_mdspan<ElementType, vector_extent<IndexType>, LayoutPolicy>;
+
+/**
+ * @brief Shorthand for c-contiguous pinned matrix view.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+using pinned_matrix_view = pinned_mdspan<ElementType, matrix_extent<IndexType>, LayoutPolicy>;
+
+/**
+ * @brief Shorthand for 128 byte aligned pinned matrix view.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>,
+          typename              = enable_if_layout_padded<ElementType, LayoutPolicy>>
+using pinned_aligned_matrix_view =
+  pinned_mdspan<ElementType,
+                matrix_extent<IndexType>,
+                LayoutPolicy,
+                std::experimental::aligned_accessor<ElementType, detail::alignment::value>>;
+
+/**
+ * @brief Create a 2-dim 128 byte aligned mdspan instance for pinned pointer. It's
+ * expected that the given layout policy match the layout of the underlying
+ * pointer.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ * @tparam IndexType the index type of the extents
+ * @param[in] ptr to pinned memory to wrap
+ * @param[in] n_rows number of rows in pointer
+ * @param[in] n_cols number of columns in pointer
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>>
+auto constexpr make_pinned_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols)
+{
+  using data_handle_type =
+    typename std::experimental::aligned_accessor<ElementType,
+                                                 detail::alignment::value>::data_handle_type;
+  static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
+                std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
+  assert(reinterpret_cast<std::uintptr_t>(ptr) ==
+         std::experimental::details::alignTo(reinterpret_cast<std::uintptr_t>(ptr),
                                             detail::alignment::value));
+
+  data_handle_type aligned_pointer = ptr;
+
+  matrix_extent<IndexType> extents{n_rows, n_cols};
+  return pinned_aligned_matrix_view<ElementType, IndexType, LayoutPolicy>{aligned_pointer,
+                                                                          extents};
+}
+
+/**
+ * @brief Create a 0-dim (scalar) mdspan instance for pinned value.
+ *
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @param[in] ptr to pinned memory to wrap
+ */
+template <typename ElementType, typename IndexType = std::uint32_t>
+auto constexpr make_pinned_scalar_view(ElementType* ptr)
+{
+  scalar_extent<IndexType> extents;
+  return pinned_scalar_view<ElementType, IndexType>{ptr, extents};
+}
+
+/**
+ * @brief Create a 2-dim c-contiguous mdspan instance for pinned pointer. It's
+ * expected that the given layout policy match the layout of the underlying
+ * pointer.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @tparam IndexType the index type of the extents
+ * @param[in] ptr to pinned memory to wrap
+ * @param[in] n_rows number of rows in pointer
+ * @param[in] n_cols number of columns in pointer
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto constexpr make_pinned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols)
+{
+  matrix_extent<IndexType> extents{n_rows, n_cols};
+  return pinned_matrix_view<ElementType, IndexType, LayoutPolicy>{ptr, extents};
+}
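
The factories above wrap existing allocations rather than allocating; a sketch pairing them with an explicit `cudaMallocHost` allocation (assuming `RAFT_CUDA_TRY` from `raft/util/cuda_rt_essentials.hpp`):

    #include <cstdint>
    #include <raft/core/pinned_mdspan.hpp>
    #include <raft/util/cuda_rt_essentials.hpp>

    void wrap_pinned_pointer()
    {
      float* raw = nullptr;
      RAFT_CUDA_TRY(cudaMallocHost(&raw, 3 * 4 * sizeof(float)));
      auto view  = raft::make_pinned_matrix_view<float, std::uint32_t>(raw, 3, 4);
      view(2, 3) = 42.0f;  // host-side write; a kernel could read the same view
      RAFT_CUDA_TRY(cudaFreeHost(raw));
    }
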
+
+/**
+ * @brief Create a 2-dim mdspan instance for pinned pointer with a strided layout
+ * that is restricted to stride 1 in the trailing dimension. It's
+ * expected that the given layout policy match the layout of the underlying
+ * pointer.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] ptr to pinned memory to wrap
+ * @param[in] n_rows number of rows in pointer
+ * @param[in] n_cols number of columns in pointer
+ * @param[in] stride leading dimension / stride of data
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto constexpr make_pinned_strided_matrix_view(ElementType* ptr,
+                                               IndexType n_rows,
+                                               IndexType n_cols,
+                                               IndexType stride)
+{
+  constexpr auto is_row_major = std::is_same_v<LayoutPolicy, layout_c_contiguous>;
+  IndexType stride0           = is_row_major ? (stride > 0 ? stride : n_cols) : 1;
+  IndexType stride1           = is_row_major ? 1 : (stride > 0 ? stride : n_rows);
+
+  assert(is_row_major ? stride0 >= n_cols : stride1 >= n_rows);
+  matrix_extent<IndexType> extents{n_rows, n_cols};
+
+  auto layout = make_strided_layout(extents, std::array<IndexType, 2>{stride0, stride1});
+  return pinned_matrix_view<ElementType, IndexType, layout_stride>{ptr, layout};
+}
+
+/**
+ * @brief Create a 1-dim mdspan instance for pinned pointer.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] ptr to pinned memory to wrap
+ * @param[in] n number of elements in pointer
+ * @return raft::pinned_vector_view
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto constexpr make_pinned_vector_view(ElementType* ptr, IndexType n)
+{
+  return pinned_vector_view<ElementType, IndexType, LayoutPolicy>{ptr, n};
+}
+
+/**
+ * @brief Create a 1-dim mdspan instance for pinned pointer.
+ * @tparam ElementType the data type of the vector elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param[in] ptr to pinned memory to wrap
+ * @param[in] mapping The layout mapping to use for this vector
+ * @return raft::pinned_vector_view
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous>
+auto constexpr make_pinned_vector_view(
+  ElementType* ptr,
+  const typename LayoutPolicy::template mapping<vector_extent<IndexType>>& mapping)
+{
+  return pinned_vector_view<ElementType, IndexType, LayoutPolicy>{ptr, mapping};
+}
+
+/**
+ * @brief Create a raft::pinned_mdspan
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy policy for strides and layout ordering
+ * @param ptr Pointer to the data
+ * @param exts dimensionality of the array (series of integers)
+ * @return raft::pinned_mdspan
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_c_contiguous,
+          size_t... Extents>
+auto constexpr make_pinned_mdspan(ElementType* ptr, extents<IndexType, Extents...> exts)
+{
+  return make_mdspan<ElementType, IndexType, LayoutPolicy, true, true>(ptr, exts);
+}
+}  // end namespace raft
diff --git a/cpp/include/raft/core/serialize.hpp b/cpp/include/raft/core/serialize.hpp
index b2fef8c6ef..7e3aab8b89 100644
--- a/cpp/include/raft/core/serialize.hpp
+++ b/cpp/include/raft/core/serialize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <raft/core/pinned_mdspan.hpp>
 #include 
 #include 
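
As an illustration of the strided factory above, viewing a row-padded pinned buffer (the stride is given in elements, not bytes):

    #include <cstdint>
    #include <raft/core/pinned_mdspan.hpp>

    // A 3 x 4 row-major matrix stored with leading dimension 8: each row
    // occupies 8 elements, of which the last 4 are padding.
    void strided_example(float* pinned_storage /* at least 3 * 8 elements */)
    {
      auto padded = raft::make_pinned_strided_matrix_view<float, std::uint32_t>(
        pinned_storage, 3, 4, 8);
      padded(1, 2) = 0.0f;  // addresses pinned_storage[1 * 8 + 2]
    }
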
diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp
index f7e7934dbf..128050c414 100644
--- a/cpp/include/raft/core/stream_view.hpp
+++ b/cpp/include/raft/core/stream_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#pragma once
 #include 
 #include 
 #include 
diff --git a/cpp/include/raft/util/memory_type_dispatcher.cuh b/cpp/include/raft/util/memory_type_dispatcher.cuh
new file mode 100644
index 0000000000..94d838415a
--- /dev/null
+++ b/cpp/include/raft/util/memory_type_dispatcher.cuh
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/core/error.hpp>
+#include <raft/core/mdbuffer.cuh>
+#include <raft/core/memory_type.hpp>
+#include <raft/core/resources.hpp>
+#include <type_traits>
+
+namespace raft {
+
+namespace detail {
+
+template <typename lambda_t, typename arg_t, typename = void>
+struct is_callable : std::false_type {};
+
+template <typename lambda_t, typename arg_t>
+struct is_callable<lambda_t,
+                   arg_t,
+                   std::void_t<decltype(std::declval<lambda_t>()(std::declval<arg_t>()))>>
+  : std::true_type {};
+
+template <typename lambda_t,
+          typename mdbuffer_type,
+          memory_type mem_type,
+          enable_if_mdbuffer<mdbuffer_type>* = nullptr>
+auto static constexpr is_callable_for_memory_type =
+  is_callable<lambda_t,
+              decltype(std::declval<mdbuffer_type>().template view<mem_type>())>::value;
+
+}  // namespace detail
+
+/**
+ * @defgroup memory_type_dispatcher Dispatch functor based on memory type
+ * @{
+ */
+
+/**
+ * @brief Dispatch to various specializations of a functor which accepts an
+ * mdspan based on the mdspan's memory type
+ *
+ * This function template is used to dispatch to one or more implementations
+ * of a function based on memory type. For instance, if a functor has been
+ * implemented with an operator that accepts only a `device_mdspan`, input data
+ * can be passed to that functor with minimal copies or allocations by wrapping
+ * the functor in this template.
+ *
+ * More specifically, host memory data will be copied to device before being
+ * passed to the functor as a `device_mdspan`. Device, managed, and pinned data
+ * will be passed directly to the functor as a `device_mdspan`.
+ *
+ * If the functor's operator were _also_ specialized for `host_mdspan`, then
+ * this wrapper would pass an input `host_mdspan` directly to the corresponding
+ * specialization.
+ *
+ * If a functor explicitly specializes for managed/pinned memory and receives
+ * managed/pinned input, the corresponding specialization will be invoked. If the functor does not
+ * specialize for either, it will preferentially invoke the device
+ * specialization if available and then the host specialization. Managed input
+ * will never be dispatched to an explicit specialization for pinned memory and
+ * vice versa.
+ *
+ * Dispatching is performed by coercing the input mdspan to an mdbuffer of the
+ * correct type. If it is necessary to coerce the input data to a different
+ * data type (e.g. floats to doubles) or to a different memory layout, this can
+ * be done by passing an explicit mdbuffer type to the `memory_type_dispatcher`
+ * template.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * // Functor which accepts only a `device_mdspan` or `managed_mdspan` of
+ * // doubles in C-contiguous layout. We wish to be able to call this
+ * // functor on any compatible data, regardless of data type, memory type,
+ * // or layout.
+ * struct functor {
+ *   auto operator()(device_matrix_view<double> data) {
+ *     // Do something with data on device
+ *   };
+ *   auto operator()(managed_matrix_view<double> data) {
+ *     // Do something with data, taking advantage of knowledge that
+ *     // underlying memory is managed
+ *   };
+ * };
+ *
+ * auto rows = 3;
+ * auto cols = 5;
+ * auto res = raft::device_resources{};
+ *
+ * auto host_data = raft::make_host_matrix<double>(rows, cols);
+ * // functor{}(host_data.view()); // This would fail to compile
+ * auto device_data = raft::make_device_matrix<double>(res, rows, cols);
+ * functor{}(device_data.view()); // Functor accepts device mdspan
+ * auto managed_data = raft::make_managed_matrix<double>(res, rows, cols);
+ * functor{}(managed_data.view()); // Functor accepts managed mdspan
+ * auto pinned_data = raft::make_pinned_matrix<double>(res, rows, cols);
+ * // functor{}(pinned_data.view()); // This would fail to compile
+ * auto float_data = raft::make_device_matrix<float>(res, rows, cols);
+ * // functor{}(float_data.view()); // This would fail to compile
+ * auto f_data = raft::make_device_matrix<double, int, raft::layout_f_contiguous>(res, rows, cols);
+ * // functor{}(f_data.view()); // This would fail to compile
+ *
+ * // `memory_type_dispatcher` lets us call this functor on all of the above
+ * raft::memory_type_dispatcher(res, functor{}, host_data.view());
+ * raft::memory_type_dispatcher(res, functor{}, device_data.view());
+ * raft::memory_type_dispatcher(res, functor{}, managed_data.view());
+ * raft::memory_type_dispatcher(res, functor{}, pinned_data.view());
+ * // Here, we use the mdbuffer type template parameter to ensure that the data
+ * // type and layout are as expected by the functor
+ * raft::memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<int>>>(
+ *   res, functor{}, float_data.view());
+ * raft::memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<int>>>(
+ *   res, functor{}, f_data.view());
+ * @endcode
+ *
+ * As this example shows, `memory_type_dispatcher` can be used to dispatch any
+ * compatible mdspan input to a functor, regardless of the mdspan type(s) that
+ * functor supports.
+ */
+template <typename lambda_t, typename mdbuffer_type, enable_if_mdbuffer<mdbuffer_type>* = nullptr>
+decltype(auto) memory_type_dispatcher(raft::resources const& res, lambda_t&& f, mdbuffer_type&& buf)
+{
+  if (is_host_device_accessible(buf.mem_type())) {
+    // First see if functor has been specialized for this exact memory type
+    if constexpr (detail::is_callable_for_memory_type<lambda_t,
+                                                      mdbuffer_type,
+                                                      memory_type::managed>) {
+      if (buf.mem_type() == memory_type::managed) {
+        return f(buf.template view<memory_type::managed>());
+      }
+    }
+    if constexpr (detail::is_callable_for_memory_type<lambda_t,
+                                                      mdbuffer_type,
+                                                      memory_type::pinned>) {
+      if (buf.mem_type() == memory_type::pinned) {
+        return f(buf.template view<memory_type::pinned>());
+      }
+    }
+  }
+  // If the functor is specialized for device and the data are
+  // device-accessible, use the device specialization
+  if constexpr (detail::is_callable_for_memory_type<lambda_t, mdbuffer_type, memory_type::device>) {
+    if (is_device_accessible(buf.mem_type())) {
+      return f(mdbuffer{res, buf, memory_type::device}.template view<memory_type::device>());
+    }
+    // If there is no host specialization, still use the device specialization
+    if constexpr (!detail::
+                    is_callable_for_memory_type<lambda_t, mdbuffer_type, memory_type::host>) {
+      return f(mdbuffer{res, buf, memory_type::device}.template view<memory_type::device>());
+    }
+  }
+
+  // If nothing else has worked, use the host specialization
+  if constexpr (detail::is_callable_for_memory_type<lambda_t, mdbuffer_type, memory_type::host>) {
+    return f(mdbuffer{res, buf, memory_type::host}.template view<memory_type::host>());
+  }
+
+  // In the extremely rare case that the functor has been specialized _only_
+  // for either pinned memory, managed memory, or both, and the input data are
+  // neither pinned nor managed, we must perform a copy.
+  // In this situation, if we have specializations for both pinned and managed
+  // memory, we arbitrarily prefer the managed specialization. Note that if the
+  // data _are_ either pinned or managed already, we will have already invoked
+  // the correct specialization above.
+  if constexpr (detail::is_callable_for_memory_type<lambda_t,
+                                                    mdbuffer_type,
+                                                    memory_type::managed>) {
+    return f(mdbuffer{res, buf, memory_type::managed}.template view<memory_type::managed>());
+  } else if constexpr (detail::is_callable_for_memory_type<lambda_t,
+                                                           mdbuffer_type,
+                                                           memory_type::pinned>) {
+    return f(mdbuffer{res, buf, memory_type::pinned}.template view<memory_type::pinned>());
+  }
+
+  // Suppress warning for unreachable loop. In general, it is a desirable thing
+  // for this to be unreachable, but some functors may be specialized in such a
+  // way that this is not the case.
+#pragma nv_diag_suppress 128
+  RAFT_FAIL("The given functor could not be invoked on the provided data");
+#pragma nv_diag_default 128
+}
+
+template <typename lambda_t, typename mdspan_type, enable_if_mdspan<mdspan_type>* = nullptr>
+decltype(auto) memory_type_dispatcher(raft::resources const& res, lambda_t&& f, mdspan_type view)
+{
+  return memory_type_dispatcher(res, std::forward<lambda_t>(f), mdbuffer{view});
+}
+
+template <typename mdbuffer_type,
+          typename lambda_t,
+          typename mdspan_type,
+          enable_if_mdbuffer<mdbuffer_type>* = nullptr,
+          enable_if_mdspan<mdspan_type>*     = nullptr>
+decltype(auto) memory_type_dispatcher(raft::resources const& res, lambda_t&& f, mdspan_type view)
+{
+  return memory_type_dispatcher(res, std::forward<lambda_t>(f), mdbuffer_type{res, mdbuffer{view}});
+}
+
+/** @} */
+
+}  // namespace raft
diff --git a/cpp/include/raft/util/variant_utils.hpp b/cpp/include/raft/util/variant_utils.hpp
new file mode 100644
index 0000000000..26ca2b7eb4
--- /dev/null
+++ b/cpp/include/raft/util/variant_utils.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <variant>
+
+namespace raft {
+
+template <typename... types>
+struct concatenated_variant;
+
+template <typename... left_types, typename... right_types>
+struct concatenated_variant<std::variant<left_types...>, std::variant<right_types...>> {
+  using type = std::variant<left_types..., right_types...>;
+};
+
+template <typename... types>
+using concatenated_variant_t = typename concatenated_variant<types...>::type;
+
+template <typename visitor_t, typename variant_t, std::size_t index = std::size_t{}>
+auto fast_visit(visitor_t&& visitor, variant_t&& variant)
+{
+  using return_t = decltype(std::forward<visitor_t>(visitor)(std::get<0>(variant)));
+  auto result    = return_t{};
+
+  if constexpr (index ==
+                std::variant_size_v<std::remove_reference_t<decltype(variant)>>) {
+    __builtin_unreachable();
+  } else {
+    if (index == variant.index()) {
+      result = std::forward<visitor_t>(visitor)(std::get<index>(std::forward<variant_t>(variant)));
+    } else {
+      result = fast_visit<visitor_t, variant_t, index + 1>(std::forward<visitor_t>(visitor),
+                                                           std::forward<variant_t>(variant));
+    }
+  }
+  return result;
+}
+
+template <typename T, typename variant_t>
+struct is_type_in_variant;
+
+template <typename T, typename... variant_types>
+struct is_type_in_variant<T, std::variant<variant_types...>> {
+  static constexpr bool value = (std::is_same_v<T, variant_types> || ...);
+};
+
+template <typename T, typename variant_t>
+auto static constexpr is_type_in_variant_v = is_type_in_variant<T, variant_t>::value;
+
+}  // namespace raft
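
A short sketch of the variant helpers just introduced (the values are illustrative):

    #include <raft/util/variant_utils.hpp>
    #include <variant>

    using left_t  = std::variant<int, float>;
    using right_t = std::variant<double>;
    // Concatenation yields std::variant<int, float, double>:
    using all_t = raft::concatenated_variant_t<left_t, right_t>;
    static_assert(raft::is_type_in_variant_v<float, all_t>);
    static_assert(!raft::is_type_in_variant_v<char, all_t>);

    // fast_visit unrolls the alternative check into an if-chain; for a single
    // variant its observable behavior matches std::visit:
    inline auto truncate(all_t const& v)
    {
      return raft::fast_visit([](auto x) { return static_cast<int>(x); }, v);
    }
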
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index f043442840..6e32281ec0 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -124,6 +124,7 @@ if(BUILD_TESTS)
     test/core/interruptible.cu
     test/core/nvtx.cpp
     test/core/mdarray.cu
+    test/core/mdbuffer.cu
     test/core/mdspan_copy.cpp
     test/core/mdspan_copy.cu
     test/core/mdspan_utils.cu
@@ -460,6 +461,7 @@ if(BUILD_TESTS)
     test/util/device_atomics.cu
     test/util/integer_utils.cpp
     test/util/integer_utils.cu
+    test/util/memory_type_dispatcher.cu
     test/util/pow2_utils.cu
     test/util/reduction.cu
   )
diff --git a/cpp/test/core/mdarray.cu b/cpp/test/core/mdarray.cu
index 86e51be2e4..b0ab36c6e3 100644
--- a/cpp/test/core/mdarray.cu
+++ b/cpp/test/core/mdarray.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <raft/core/pinned_mdarray.hpp>
 #include 
 #include 
 #include 
diff --git a/cpp/test/core/mdbuffer.cu b/cpp/test/core/mdbuffer.cu
new file mode 100644
index 0000000000..d93d532938
--- /dev/null
+++ b/cpp/test/core/mdbuffer.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +TEST(MDBuffer, FromHost) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto data = make_host_mdarray( + res, extents{}); + + auto buffer = mdbuffer(data); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::host); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + EXPECT_EQ(buffer.view().index(), variant_index_from_memory_type(memory_type::host)); + + buffer = mdbuffer(data.view()); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::host); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + + auto original_data_handle = data.data_handle(); + buffer = mdbuffer(std::move(data)); + EXPECT_TRUE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::host); + EXPECT_EQ(buffer.view().data_handle(), original_data_handle); + + auto buffer2 = mdbuffer(res, buffer); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::host); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::host); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::host); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::device); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::device); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::managed); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::managed); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::pinned); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::pinned); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); +} + +TEST(MDBuffer, FromDevice) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto data = make_device_mdarray( + res, extents{}); + + auto buffer = mdbuffer(data); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::device); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + EXPECT_EQ(buffer.view().index(), variant_index_from_memory_type(memory_type::device)); + + buffer = mdbuffer(data.view()); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::device); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + 
std::as_const(buffer).view().data_handle()); + + auto original_data_handle = data.data_handle(); + buffer = mdbuffer(std::move(data)); + EXPECT_TRUE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::device); + EXPECT_EQ(buffer.view().data_handle(), original_data_handle); + + auto buffer2 = mdbuffer(res, buffer); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::device); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::host); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::host); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::device); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::device); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::managed); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::managed); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::pinned); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::pinned); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); +} + +TEST(MDBuffer, FromManaged) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto data = make_managed_mdarray( + res, extents{}); + + auto buffer = mdbuffer(data); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::managed); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + EXPECT_EQ(buffer.view().index(), variant_index_from_memory_type(memory_type::managed)); + + buffer = mdbuffer(data.view()); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::managed); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + + auto original_data_handle = data.data_handle(); + buffer = mdbuffer(std::move(data)); + EXPECT_TRUE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::managed); + EXPECT_EQ(buffer.view().data_handle(), original_data_handle); + + auto buffer2 = mdbuffer(res, buffer); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::managed); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::host); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::host); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::device); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::device); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::managed); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::managed); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + 
buffer2 = mdbuffer(res, buffer, memory_type::pinned); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::pinned); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); +} + +TEST(MDBuffer, FromPinned) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto data = make_pinned_mdarray( + res, extents{}); + + auto buffer = mdbuffer(data); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::pinned); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + EXPECT_EQ(buffer.view().index(), variant_index_from_memory_type(memory_type::pinned)); + + buffer = mdbuffer(data.view()); + EXPECT_FALSE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::pinned); + EXPECT_EQ(buffer.view().data_handle(), data.data_handle()); + EXPECT_EQ(std::as_const(buffer).view().data_handle(), data.data_handle()); + EXPECT_EQ(buffer.view().data_handle(), + std::as_const(buffer).view().data_handle()); + + auto original_data_handle = data.data_handle(); + buffer = mdbuffer(std::move(data)); + EXPECT_TRUE(buffer.is_owning()); + EXPECT_EQ(buffer.mem_type(), memory_type::pinned); + EXPECT_EQ(buffer.view().data_handle(), original_data_handle); + + auto buffer2 = mdbuffer(res, buffer); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::pinned); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::host); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::host); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::device); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::device); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::managed); + EXPECT_TRUE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::managed); + EXPECT_NE(buffer2.view().data_handle(), + buffer.view().data_handle()); + + buffer2 = mdbuffer(res, buffer, memory_type::pinned); + EXPECT_FALSE(buffer2.is_owning()); + EXPECT_EQ(buffer2.mem_type(), memory_type::pinned); + EXPECT_EQ(buffer2.view().data_handle(), + buffer.view().data_handle()); +} + +TEST(MDBuffer, ImplicitMdspanConversion) +{ + auto res = device_resources{}; + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + + using extents_type = extents; + auto shared_extents = extents_type{}; + + auto data_host = make_host_mdarray( + res, shared_extents); + auto data_device = + make_device_mdarray(res, + shared_extents); + auto data_managed = + make_managed_mdarray( + res, shared_extents); + auto data_pinned = + make_pinned_mdarray(res, + shared_extents); + + auto test_function = [shared_extents](mdbuffer&& buf) { + std::visit([shared_extents](auto view) { EXPECT_EQ(view.extents(), shared_extents); }, + buf.view()); + }; + + test_function(data_host); + test_function(data_device); + test_function(data_managed); + test_function(data_pinned); + test_function(data_host.view()); + test_function(data_device.view()); + test_function(data_managed.view()); + 
test_function(data_pinned.view()); + + auto test_const_function = [shared_extents](mdbuffer&& buf) { + std::visit([shared_extents](auto view) { EXPECT_EQ(view.extents(), shared_extents); }, + buf.view()); + }; + + test_const_function(data_host.view()); + test_const_function(data_device.view()); + test_const_function(data_managed.view()); + test_const_function(data_pinned.view()); +} + +} // namespace raft diff --git a/cpp/test/core/memory_type.cpp b/cpp/test/core/memory_type.cpp index 02aa8caa6c..cd8aa6bd9e 100644 --- a/cpp/test/core/memory_type.cpp +++ b/cpp/test/core/memory_type.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -22,7 +23,7 @@ TEST(MemoryType, IsDeviceAccessible) static_assert(!is_device_accessible(memory_type::host)); static_assert(is_device_accessible(memory_type::device)); static_assert(is_device_accessible(memory_type::managed)); - static_assert(!is_device_accessible(memory_type::pinned)); + static_assert(is_device_accessible(memory_type::pinned)); } TEST(MemoryType, IsHostAccessible) @@ -38,6 +39,33 @@ TEST(MemoryType, IsHostDeviceAccessible) static_assert(!is_host_device_accessible(memory_type::host)); static_assert(!is_host_device_accessible(memory_type::device)); static_assert(is_host_device_accessible(memory_type::managed)); - static_assert(!is_host_device_accessible(memory_type::pinned)); + static_assert(is_host_device_accessible(memory_type::pinned)); } + +TEST(MemoryTypeFromPointer, Host) +{ + auto ptr1 = static_cast(nullptr); + cudaMallocHost(&ptr1, 1); + EXPECT_EQ(memory_type_from_pointer(ptr1), memory_type::host); + cudaFree(ptr1); + auto ptr2 = static_cast(nullptr); + EXPECT_EQ(memory_type_from_pointer(ptr2), memory_type::host); +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MemoryTypeFromPointer, Device) +{ + auto ptr = static_cast(nullptr); + cudaMalloc(&ptr, 1); + EXPECT_EQ(memory_type_from_pointer(ptr), memory_type::device); + cudaFree(ptr); +} +TEST(MemoryTypeFromPointer, Managed) +{ + auto ptr = static_cast(nullptr); + cudaMallocManaged(&ptr, 1); + EXPECT_EQ(memory_type_from_pointer(ptr), memory_type::managed); + cudaFree(ptr); +} +#endif } // namespace raft diff --git a/cpp/test/core/numpy_serializer.cu b/cpp/test/core/numpy_serializer.cu index 0d12b97555..5c562d68f7 100644 --- a/cpp/test/core/numpy_serializer.cu +++ b/cpp/test/core/numpy_serializer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/test/util/memory_type_dispatcher.cu b/cpp/test/util/memory_type_dispatcher.cu new file mode 100644 index 0000000000..5e24ff5719 --- /dev/null +++ b/cpp/test/util/memory_type_dispatcher.cu @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +namespace dispatch_test { +struct functor_h { + template + auto static constexpr expected_output() + { + return memory_type::host; + } + auto operator()(host_matrix_view input) { return memory_type::host; } +}; +struct functor_d { + template + auto static constexpr expected_output() + { + return memory_type::device; + } + auto operator()(host_matrix_view input) { return memory_type::device; } +}; +struct functor_m { + template + auto static constexpr expected_output() + { + return memory_type::managed; + } + auto operator()(host_matrix_view input) { return memory_type::managed; } +}; +struct functor_p { + template + auto static constexpr expected_output() + { + return memory_type::pinned; + } + auto operator()(host_matrix_view input) { return memory_type::pinned; } +}; + +struct functor_hd { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::host) { + return memory_type::host; + } else { + return memory_type::device; + } + } + auto operator()(host_matrix_view input) { return memory_type::host; } + auto operator()(device_matrix_view input) { return memory_type::device; } +}; +struct functor_hm { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::managed) { + return memory_type::managed; + } else { + return memory_type::host; + } + } + auto operator()(host_matrix_view input) { return memory_type::host; } + auto operator()(managed_matrix_view input) { return memory_type::managed; } +}; +struct functor_hp { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::pinned) { + return memory_type::pinned; + } else { + return memory_type::host; + } + } + auto operator()(host_matrix_view input) { return memory_type::host; } + auto operator()(pinned_matrix_view input) { return memory_type::pinned; } +}; +struct functor_dm { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::managed) { + return memory_type::managed; + } else { + return memory_type::device; + } + } + auto operator()(device_matrix_view input) { return memory_type::device; } + auto operator()(managed_matrix_view input) { return memory_type::managed; } +}; +struct functor_dp { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::pinned) { + return memory_type::pinned; + } else { + return memory_type::device; + } + } + auto operator()(device_matrix_view input) { return memory_type::device; } + auto operator()(pinned_matrix_view input) { return memory_type::pinned; } +}; +struct functor_mp { + template + auto static constexpr expected_output() + { + if constexpr (input_memory_type == memory_type::pinned) { + return memory_type::pinned; + } else { + return memory_type::managed; + } + } + auto operator()(managed_matrix_view input) { return 
memory_type::managed; }
+  auto operator()(pinned_matrix_view<double> input) { return memory_type::pinned; }
+};
+
+struct functor_hdm {
+  template <raft::memory_type input_memory_type>
+  auto static constexpr expected_output()
+  {
+    if constexpr (input_memory_type == memory_type::host) {
+      return memory_type::host;
+    } else if constexpr (input_memory_type == memory_type::managed) {
+      return memory_type::managed;
+    } else {
+      return memory_type::device;
+    }
+  }
+  auto operator()(host_matrix_view<double> input) { return memory_type::host; }
+  auto operator()(device_matrix_view<double> input) { return memory_type::device; }
+  auto operator()(managed_matrix_view<double> input) { return memory_type::managed; }
+};
+struct functor_hdp {
+  template <raft::memory_type input_memory_type>
+  auto static constexpr expected_output()
+  {
+    if constexpr (input_memory_type == memory_type::host) {
+      return memory_type::host;
+    } else if constexpr (input_memory_type == memory_type::pinned) {
+      return memory_type::pinned;
+    } else {
+      return memory_type::device;
+    }
+  }
+  auto operator()(host_matrix_view<double> input) { return memory_type::host; }
+  auto operator()(device_matrix_view<double> input) { return memory_type::device; }
+  auto operator()(pinned_matrix_view<double> input) { return memory_type::pinned; }
+};
+struct functor_dmp {
+  template <raft::memory_type input_memory_type>
+  auto static constexpr expected_output()
+  {
+    if constexpr (input_memory_type == memory_type::managed) {
+      return memory_type::managed;
+    } else if constexpr (input_memory_type == memory_type::pinned) {
+      return memory_type::pinned;
+    } else {
+      return memory_type::device;
+    }
+  }
+  auto operator()(device_matrix_view<double> input) { return memory_type::device; }
+  auto operator()(managed_matrix_view<double> input) { return memory_type::managed; }
+  auto operator()(pinned_matrix_view<double> input) { return memory_type::pinned; }
+};
+
+struct functor_hdmp {
+  template <raft::memory_type input_memory_type>
+  auto static constexpr expected_output()
+  {
+    return input_memory_type;
+  }
+  auto operator()(host_matrix_view<double> input) { return memory_type::host; }
+  auto operator()(device_matrix_view<double> input) { return memory_type::device; }
+  auto operator()(managed_matrix_view<double> input) { return memory_type::managed; }
+  auto operator()(pinned_matrix_view<double> input) { return memory_type::pinned; }
+};
+
+template <raft::memory_type input_memory_type,
+          typename T            = double,
+          typename LayoutPolicy = raft::layout_c_contiguous>
+auto generate_input(raft::resources const& res)
+{
+  auto constexpr rows = std::uint32_t{3};
+  auto constexpr cols = std::uint32_t{5};
+  if constexpr (input_memory_type == raft::memory_type::host) {
+    return raft::make_host_matrix<T, std::uint32_t, LayoutPolicy>(rows, cols);
+  } else if constexpr (input_memory_type == raft::memory_type::device) {
+    return raft::make_device_matrix<T, std::uint32_t, LayoutPolicy>(res, rows, cols);
+  } else if constexpr (input_memory_type == raft::memory_type::managed) {
+    return raft::make_managed_matrix<T, std::uint32_t, LayoutPolicy>(res, rows, cols);
+  } else if constexpr (input_memory_type == raft::memory_type::pinned) {
+    return raft::make_pinned_matrix<T, std::uint32_t, LayoutPolicy>(res, rows, cols);
+  }
+}
+
+template <raft::memory_type input_memory_type>
+void test_memory_type_dispatcher()
+{
+  auto res          = raft::device_resources{};
+  auto data         = generate_input<input_memory_type>(res);
+  auto data_float   = generate_input<input_memory_type, float>(res);
+  auto data_f       = generate_input<input_memory_type, double, raft::layout_f_contiguous>(res);
+  auto data_f_float = generate_input<input_memory_type, float, raft::layout_f_contiguous>(res);
+
+  EXPECT_EQ(memory_type_dispatcher(res, functor_h{}, data.view()),
+            functor_h::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_d{}, data.view()),
+            functor_d::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_m{}, data.view()),
+            functor_m::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_p{}, data.view()),
+            functor_p::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hd{}, data.view()),
+            functor_hd::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hm{}, data.view()),
+            functor_hm::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hp{}, data.view()),
+            functor_hp::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_dm{}, data.view()),
+            functor_dm::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_dp{}, data.view()),
+            functor_dp::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_mp{}, data.view()),
+            functor_mp::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hdm{}, data.view()),
+            functor_hdm::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hdp{}, data.view()),
+            functor_hdp::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_dmp{}, data.view()),
+            functor_dmp::expected_output<input_memory_type>());
+  EXPECT_EQ(memory_type_dispatcher(res, functor_hdmp{}, data.view()),
+            functor_hdmp::expected_output<input_memory_type>());
+
+  // Functor expects double; input is float
+  auto out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_h{}, data_float.view());
+  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_d{}, data_float.view());
+  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_m{}, data_float.view());
+  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_p{}, data_float.view());
+  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hd{}, data_float.view());
+  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hm{}, data_float.view());
+  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hp{}, data_float.view());
+  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dm{}, data_float.view());
+  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dp{}, data_float.view());
+  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_mp{}, data_float.view());
+  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdm{}, data_float.view());
+  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdp{}, data_float.view());
+  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dmp{}, data_float.view());
+  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdmp{}, data_float.view());
+  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+
+  // Functor expects C-contiguous; input is F-contiguous
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_h{}, data_f.view());
+  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_d{}, data_f.view());
+  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_m{}, data_f.view());
+  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_p{}, data_f.view());
+  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hd{}, data_f.view());
+  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hm{}, data_f.view());
+  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hp{}, data_f.view());
+  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dm{}, data_f.view());
+  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dp{}, data_f.view());
+  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_mp{}, data_f.view());
+  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdm{}, data_f.view());
+  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdp{}, data_f.view());
+  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dmp{}, data_f.view());
+  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdmp{}, data_f.view());
+  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+
+  // Functor expects C-contiguous double; input is F-contiguous float
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_h{}, data_f_float.view());
+  EXPECT_EQ(out, functor_h::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_d{}, data_f_float.view());
+  EXPECT_EQ(out, functor_d::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_m{}, data_f_float.view());
+  EXPECT_EQ(out, functor_m::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_p{}, data_f_float.view());
+  EXPECT_EQ(out, functor_p::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hd{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hd::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hm{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dm{}, data_f_float.view());
+  EXPECT_EQ(out, functor_dm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_dp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_mp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_mp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdm{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hdm::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hdp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_dmp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_dmp::expected_output<input_memory_type>());
+  out = memory_type_dispatcher<raft::mdbuffer<double, raft::matrix_extent<std::uint32_t>>>(
+    res, functor_hdmp{}, data_f_float.view());
+  EXPECT_EQ(out, functor_hdmp::expected_output<input_memory_type>());
+}
+
+}  // namespace dispatch_test
+
+TEST(MemoryTypeDispatcher, FromHost)
+{
+  dispatch_test::test_memory_type_dispatcher<raft::memory_type::host>();
+}
+
+TEST(MemoryTypeDispatcher, FromDevice)
+{
+  dispatch_test::test_memory_type_dispatcher<raft::memory_type::device>();
+}
+
+TEST(MemoryTypeDispatcher, FromManaged)
+{
+  dispatch_test::test_memory_type_dispatcher<raft::memory_type::managed>();
+}
+
+TEST(MemoryTypeDispatcher, FromPinned)
+{
+  dispatch_test::test_memory_type_dispatcher<raft::memory_type::pinned>();
+}
+
+}  // namespace raft
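The tests above pin down the dispatcher's contract from every source memory type: a functor advertises, through its `operator()` overloads, which memory types it can consume, and `raft::memory_type_dispatcher` copies or converts the input (including element-type and layout coercion when an explicit `mdbuffer` target type is supplied) before invoking the best-matching overload. A minimal caller-side sketch follows; the functor name, the float element type, and the exact header path are illustrative assumptions, not part of this patch:

#include <raft/core/device_mdspan.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/memory_type_dispatcher.cuh>  // header path assumed
#include <raft/core/resources.hpp>

// A functor that can only operate on device memory: if the caller passes
// host data, memory_type_dispatcher performs the host-to-device copy before
// invoking this overload.
struct device_only_op {
  auto operator()(raft::device_matrix_view<const float> data)
  {
    // ... launch kernels on `data` and return whatever the caller needs ...
    return data.extent(0);
  }
};

auto run_on_any_memory_type(raft::resources const& res,
                            raft::host_matrix_view<const float> host_data)
{
  // host_data is host-resident, but device_only_op only accepts device
  // views; the dispatcher materializes a device copy and then calls the
  // functor with a matching device_matrix_view.
  return raft::memory_type_dispatcher(res, device_only_op{}, host_data);
}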
diff --git a/docs/source/cpp_api/mdspan.rst b/docs/source/cpp_api/mdspan.rst
index 3fc0db7b96..b311020049 100644
--- a/docs/source/cpp_api/mdspan.rst
+++ b/docs/source/cpp_api/mdspan.rst
@@ -16,4 +16,6 @@ This page provides C++ class references for the RAFT's 1d span and multi-dimensi
    mdspan_mdspan.rst
    mdspan_mdarray.rst
    mdspan_span.rst
+   mdspan_mdbuffer.rst
+   memory_type_dispatcher.rst
    mdspan_temporary_device_buffer.rst
diff --git a/docs/source/cpp_api/mdspan_mdarray.rst b/docs/source/cpp_api/mdspan_mdarray.rst
index bcc2254204..af3943065d 100644
--- a/docs/source/cpp_api/mdspan_mdarray.rst
+++ b/docs/source/cpp_api/mdspan_mdarray.rst
@@ -68,4 +68,68 @@ Host Factories
 .. doxygengroup:: host_mdarray_factories
    :project: RAFT
    :members:
-   :content-only:
\ No newline at end of file
+   :content-only:
+
+Managed Vocabulary
+------------------
+
+``#include <raft/core/managed_mdarray.hpp>``
+
+.. doxygentypedef:: raft::managed_mdarray
+   :project: RAFT
+
+.. doxygentypedef:: raft::managed_matrix
+   :project: RAFT
+
+.. doxygentypedef:: raft::managed_vector
+   :project: RAFT
+
+.. doxygentypedef:: raft::managed_scalar
+   :project: RAFT
+
+
+Managed Factories
+-----------------
+
+``#include <raft/core/managed_mdarray.hpp>``
+
+.. doxygenfunction:: raft::make_managed_matrix
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_managed_vector
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_managed_scalar
+   :project: RAFT
+
+Pinned Vocabulary
+-----------------
+
+``#include <raft/core/pinned_mdarray.hpp>``
+
+.. doxygentypedef:: raft::pinned_mdarray
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_matrix
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_vector
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_scalar
+   :project: RAFT
+
+
+Pinned Factories
+----------------
+
+``#include <raft/core/pinned_mdarray.hpp>``
+
+.. doxygenfunction:: raft::make_pinned_matrix
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_pinned_vector
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_pinned_scalar
+   :project: RAFT
diff --git a/docs/source/cpp_api/mdspan_mdbuffer.rst b/docs/source/cpp_api/mdspan_mdbuffer.rst
new file mode 100644
index 0000000000..40fe066a2e
--- /dev/null
+++ b/docs/source/cpp_api/mdspan_mdbuffer.rst
@@ -0,0 +1,13 @@
+mdbuffer: Multi-dimensional Maybe-Owning Container
+==================================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/mdbuffer.cuh>``
+
+.. doxygengroup:: mdbuffer_apis
+   :project: RAFT
+   :members:
+   :content-only:
diff --git a/docs/source/cpp_api/mdspan_mdspan.rst b/docs/source/cpp_api/mdspan_mdspan.rst
index f9f972aa74..28d06b5323 100644
--- a/docs/source/cpp_api/mdspan_mdspan.rst
+++ b/docs/source/cpp_api/mdspan_mdspan.rst
@@ -92,9 +92,9 @@ Device Factories
 Managed Vocabulary
 ------------------
 
-``#include ``
+``#include <raft/core/managed_mdspan.hpp>``
 
-..doxygentypedef:: raft::managed_mdspan
+.. doxygentypedef:: raft::managed_mdspan
    :project: RAFT
 
 .. doxygenstruct:: raft::is_managed_mdspan
@@ -122,7 +122,7 @@ Managed Vocabulary
 Managed Factories
 -----------------
 
-``#include ``
+``#include <raft/core/managed_mdspan.hpp>``
 
 .. doxygenfunction:: make_managed_mdspan(ElementType* ptr, extents<IndexType, Extents...> exts)
    :project: RAFT
@@ -177,7 +177,38 @@ Host Factories
 .. doxygenfunction:: raft::make_host_vector_view
    :project: RAFT
 
-.. doxygenfunction:: raft::make_device_scalar_view
+.. doxygenfunction:: raft::make_host_scalar_view
+   :project: RAFT
+
+Pinned Vocabulary
+-----------------
+
+``#include <raft/core/pinned_mdspan.hpp>``
+
+.. doxygentypedef:: raft::pinned_mdspan
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::pinned_scalar_view
+   :project: RAFT
+
+Pinned Factories
+----------------
+
+``#include <raft/core/pinned_mdspan.hpp>``
+
+.. doxygenfunction:: raft::make_pinned_matrix_view
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_pinned_vector_view
+   :project: RAFT
+
+.. doxygenfunction:: raft::make_pinned_scalar_view
    :project: RAFT
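The managed and pinned factories documented above mirror the existing host and device ones. A rough usage sketch follows, with the element type, extents, and names chosen purely for illustration:

#include <cstdint>

#include <raft/core/device_resources.hpp>
#include <raft/core/managed_mdarray.hpp>
#include <raft/core/pinned_mdarray.hpp>

void managed_and_pinned_example()
{
  raft::device_resources res;

  // CUDA managed (unified) memory: a single allocation addressable from
  // both host and device code.
  auto managed = raft::make_managed_matrix<float, std::uint32_t>(res, 3, 5);

  // Page-locked host memory: host-resident, but eligible for fast
  // asynchronous transfers and direct device access.
  auto pinned = raft::make_pinned_matrix<float, std::uint32_t>(res, 3, 5);

  // Both expose the usual mdarray interface.
  managed(0, 0) = 1.0f;
  pinned(0, 0)  = managed(0, 0);
}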
diff --git a/docs/source/cpp_api/memory_type_dispatcher.rst b/docs/source/cpp_api/memory_type_dispatcher.rst
new file mode 100644
index 0000000000..687a872967
--- /dev/null
+++ b/docs/source/cpp_api/memory_type_dispatcher.rst
@@ -0,0 +1,13 @@
+memory_type_dispatcher
+======================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/memory_type_dispatcher.cuh>``
+
+.. doxygengroup:: memory_type_dispatcher
+   :project: RAFT
+   :members:
+   :content-only:

From 6762fe540b19ea58904786c4cacf57ca5f0d9695 Mon Sep 17 00:00:00 2001
From: Vivek Narang <123010842+narangvivek10@users.noreply.github.com>
Date: Thu, 4 Jan 2024 14:17:00 -0500
Subject: [PATCH 11/11] Remove hardcoded limit in `print_results` function
 (#2080)

The `print_results` function in the template project is hardcoded to print
results for only two queries, irrespective of how many queries were actually
run. This change removes the hardcoded limit and prints results for the
actual number of queries.

Authors:
  - Vivek Narang (https://github.com/narangvivek10)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2080
---
 cpp/include/raft/neighbors/detail/refine_device.cuh | 2 +-
 cpp/template/src/common.cuh                        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/refine_device.cuh b/cpp/include/raft/neighbors/detail/refine_device.cuh
index 337318f791..5c9f1459e7 100644
--- a/cpp/include/raft/neighbors/detail/refine_device.cuh
+++ b/cpp/include/raft/neighbors/detail/refine_device.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/template/src/common.cuh b/cpp/template/src/common.cuh
index 0b72d3bf3b..c2cb15bcf3 100644
--- a/cpp/template/src/common.cuh
+++ b/cpp/template/src/common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ void generate_dataset(raft::device_resources const& dev_resources,
     1.0f);
 }
 
-// Copy the results to host and print a few samples
+// Copy the results to host and print them
 template <typename IdxT>
 void print_results(raft::device_resources const& dev_resources,
                    raft::device_matrix_view<IdxT, int64_t> neighbors,
@@ -61,7 +61,7 @@ void print_results(raft::device_resources const& dev_resources,
   // We need to sync the stream before accessing the data.
   raft::resource::sync_stream(dev_resources, stream);
 
-  for (int query_id = 0; query_id < 2; query_id++) {
+  for (int query_id = 0; query_id < neighbors.extent(0); query_id++) {
     std::cout << "Query " << query_id << " neighbor indices: ";
     raft::print_host_vector("", &neighbors_host(query_id, 0), topk, std::cout);
     std::cout << "Query " << query_id << " neighbor distances: ";