diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 051dad7eaa..2c92ad0a99 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -29,12 +29,15 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 - openblas -- rmm=23.12.* +- pandas +- pyyaml +- rmm==23.12.* - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index 7b859269b0..fa20c5c223 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -154,6 +154,7 @@ void parse_build_param(const nlohmann::json& conf, param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT; } } + if (conf.contains("nn_descent_niter")) { param.nn_descent_niter = conf.at("nn_descent_niter"); } } template diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu index effe2a55a4..129578c303 100644 --- a/cpp/bench/prims/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -50,10 +50,10 @@ struct KMeansBalanced : public fixture { constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } else { raft::random::uniform( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... 
exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu index 7d916e6ce0..3f74759665 100644 --- a/cpp/bench/prims/distance/kernels.cu +++ b/cpp/bench/prims/distance/kernels.cu @@ -46,9 +46,9 @@ struct GramMatrix : public fixture { A.resize(params.m * params.k, stream); B.resize(params.k * params.n, stream); C.resize(params.m * params.n, stream); - raft::random::Rng r(123456ULL); - r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream); - r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream); + raft::random::RngState rng(123456ULL); + raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0)); + raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0)); } ~GramMatrix() diff --git a/cpp/bench/prims/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu index f83953f8e4..1db23e4ca4 100644 --- a/cpp/bench/prims/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -42,7 +42,7 @@ struct rowNorm : public fixture { rowNorm(const norm_input& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu index ad9052a008..91319e774c 100644 --- a/cpp/bench/prims/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -41,7 +41,7 @@ struct rowNormalize : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu index ac0c612ee4..1b584e80c8 100644 --- a/cpp/bench/prims/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -42,7 +42,7 @@ struct reduce_cols_by_key : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.keys, stream), keys(p.cols, stream) { 
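    // Seed an RNG and generate random keys; note the updated API below,
    // which takes the resources handle (and its stream) rather than an
    // explicit stream argument.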
raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu index aa9c9a1f62..b68cefc274 100644 --- a/cpp/bench/prims/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -37,7 +37,7 @@ struct reduce_rows_by_key : public fixture { workspace(p.rows, stream) { raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu index a8f667257a..afee81aa00 100644 --- a/cpp/bench/prims/matrix/argmin.cu +++ b/cpp/bench/prims/matrix/argmin.cu @@ -40,7 +40,7 @@ struct Argmin : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu index ca6a2830bd..00a145ffa9 100644 --- a/cpp/bench/prims/matrix/gather.cu +++ b/cpp/bench/prims/matrix/gather.cu @@ -52,11 +52,11 @@ struct Gather : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); raft::random::uniformInt( handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); if constexpr (Conditional) { - raft::random::uniform(rng, stencil.data_handle(), params.map_length, T(-1), T(1), stream); + raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1)); } resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh index 63f6c14686..07e93a3473 100644 --- a/cpp/bench/prims/neighbors/cagra_bench.cuh +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -62,20 +62,20 @@ struct CagraBench : public fixture { constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniformInt( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } else { raft::random::uniform( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniform( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } // Generate random knn graph raft::random::uniformInt( - state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1, stream); + handle, state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1); auto metric = raft::distance::DistanceType::L2Expanded; diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index e580b20fdc..31ac869b37 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -260,9 +260,9 @@ struct knn : public fixture { constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { - raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniformInt(handle, state, vec.data(), n, kRangeMin, kRangeMax); } else { - raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniform(handle, state, vec.data(), n, kRangeMin, kRangeMax); } } diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f256f9ea0f --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. + * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. 
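+ *
+ * As a usage sketch (names and extents are illustrative only), the
+ * following copies a device matrix into a host matrix with a different
+ * element type, a case that may require the custom kernel:
+ * @code{.cpp}
+ * raft::resources res;
+ * auto src = raft::make_device_matrix<float, int>(res, 128, 64);
+ * auto dst = raft::make_host_matrix<double, int>(128, 64);
+ * raft::copy(res, dst.view(), src.view());
+ * @endcode
+ *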
The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..0a16b742a2 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. For arbitrary copies of this + * kind, `#include ` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. 
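+ *
+ * For example (illustrative names only), a host-to-host copy that changes
+ * only the memory layout compiles without a CUDA compiler:
+ * @code{.cpp}
+ * raft::resources res;
+ * auto src = raft::make_host_matrix<float, int>(128, 64);
+ * auto dst = raft::make_host_matrix<float, int, raft::layout_f_contiguous>(128, 64);
+ * raft::copy(res, dst.view(), src.view());
+ * @endcode
+ *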
This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy. Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp new file mode 100644 index 0000000000..07fb95a921 --- /dev/null +++ b/cpp/include/raft/core/cuda_support.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +namespace raft { +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; +#else +auto constexpr static const CUDA_ENABLED = false; +#endif +} // namespace raft diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp new file mode 100644 index 0000000000..b23660fefe --- /dev/null +++ b/cpp/include/raft/core/detail/copy.hpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + { + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = 
is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
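+  // `value` gates the `enable_if` below; the *_with_kernel_t aliases that
+  // follow let copy.cuh and copy.hpp declare mutually exclusive overloads.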
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
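+ *
+ * Elements are staged through a shared-memory tile (padded by one element
+ * per row, a standard trick to avoid bank conflicts) so that global-memory
+ * reads and writes can both be coalesced even when the source and
+ * destination layouts differ.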
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif + + } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA + // Perform necessary changes in layout on device, then copy to final + // destination on host + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::custom_kernel_allowed) { +#ifdef __CUDACC__ + config::check_for_unique_dst(dst); + auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_copy_kernel<<>>(dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. 
Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. + auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::STREAM_VIEW); +}; + +/** + * Load a raft::stream__view from a resources instance (and populate it on the res + * if needed). + * @param[in] res raft resources object for managing resources + * @param[in] view raft stream view + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..f7e7934dbf --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; + } + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { + } + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { +#ifndef RAFT_DISABLE_CUDA + interruptible::synchronize(base_view_); +#else + synchronize(); +#endif + } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } + } + + private: + underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } +}; + +auto static const stream_view_per_thread = stream_view{}; + +} // namespace raft diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index f9682a973f..1efb4da95e 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -318,6 +318,7 @@ index build(raft::resources const& res, auto nn_descent_params = experimental::nn_descent::index_params(); 
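  // Configure the NN-descent graph build; the new `nn_descent_niter`
  // index_param below caps the number of NN-descent iterations.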
nn_descent_params.graph_degree = intermediate_degree; nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree; + nn_descent_params.max_iterations = params.nn_descent_niter; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 5061d6082d..4db08110b9 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -58,6 +58,8 @@ struct index_params : ann::index_params { size_t graph_degree = 64; /** ANN algorithm to build knn graph. */ graph_build_algo build_algo = graph_build_algo::IVF_PQ; + /** Number of Iterations to run if building with NN_DESCENT */ + size_t nn_descent_niter = 20; }; enum class search_algo { diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 47c10de200..975ae9ec00 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -124,7 +124,7 @@ inline void make_rotation_matrix(raft::resources const& handle, uint32_t n_rows, uint32_t n_cols, float* rotation_matrix, - raft::random::Rng rng = raft::random::Rng(7ULL)) + raft::random::RngState rng = raft::random::RngState(7ULL)) { common::nvtx::range fun_scope( "ivf_pq::make_rotation_matrix(%u * %u)", n_rows, n_cols); @@ -134,7 +134,7 @@ inline void make_rotation_matrix(raft::resources const& handle, if (force_random_rotation || !inplace) { rmm::device_uvector buf(inplace ? 0 : n * n, stream); float* mat = inplace ? rotation_matrix : buf.data(); - rng.normal(mat, n * n, 0.0f, 1.0f, stream); + raft::random::normal(handle, rng, mat, n * n, 0.0f, 1.0f); linalg::detail::qrGetQ_inplace(handle, mat, n, n, stream); if (!inplace) { RAFT_CUDA_TRY(cudaMemcpy2DAsync(rotation_matrix, diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh index 1fb568a934..ce77cdc3de 100644 --- a/cpp/include/raft/neighbors/detail/nn_descent.cuh +++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh @@ -362,28 +362,28 @@ class GNND { GnndGraph graph_; std::atomic update_counter_; - Index_t nrow_; - const int ndim_; + size_t nrow_; + size_t ndim_; - raft::device_matrix<__half, Index_t, raft::row_major> d_data_; - raft::device_vector l2_norms_; + raft::device_matrix<__half, size_t, raft::row_major> d_data_; + raft::device_vector l2_norms_; - raft::device_matrix graph_buffer_; - raft::device_matrix dists_buffer_; + raft::device_matrix graph_buffer_; + raft::device_matrix dists_buffer_; // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 thrust::host_vector> graph_host_buffer_; thrust::host_vector> dists_host_buffer_; - raft::device_vector d_locks_; + raft::device_vector d_locks_; thrust::host_vector> h_rev_graph_new_; thrust::host_vector> h_graph_old_; thrust::host_vector> h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges - raft::device_vector d_list_sizes_new_; - raft::device_vector d_list_sizes_old_; + raft::device_vector d_list_sizes_new_; + raft::device_vector d_list_sizes_old_; }; constexpr int TILE_ROW_WIDTH = 64; @@ -1143,21 +1143,21 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build NUM_SAMPLES), nrow_(build_config.max_dataset_size), ndim_(build_config.dataset_dim), - d_data_{raft::make_device_matrix<__half, Index_t, raft::row_major>( + d_data_{raft::make_device_matrix<__half, 
size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, nrow_)}, graph_buffer_{ - raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ - raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - d_locks_{raft::make_device_vector(res, nrow_)}, + d_locks_{raft::make_device_vector(res, nrow_)}, h_rev_graph_new_(nrow_ * NUM_SAMPLES), h_graph_old_(nrow_ * NUM_SAMPLES), h_rev_graph_old_(nrow_ * NUM_SAMPLES), - d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, - d_list_sizes_old_{raft::make_device_vector(res, nrow_)} + d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, + d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); @@ -1342,8 +1342,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t i = 0; i < (size_t)nrow_; i++) { for (size_t j = 0; j < build_config_.node_degree; j++) { size_t idx = i * graph_.node_degree + j; - Index_t id = graph_.h_graph[idx].id(); - if (id < nrow_) { + int id = graph_.h_graph[idx].id(); + if (id < static_cast(nrow_)) { graph_shrink_buffer[i * build_config_.node_degree + j] = id; } else { graph_shrink_buffer[i * build_config_.node_degree + j] = diff --git a/cpp/include/raft/stats/detail/neighborhood_recall.cuh b/cpp/include/raft/stats/detail/neighborhood_recall.cuh new file mode 100644 index 0000000000..78cd64538e --- /dev/null +++ b/cpp/include/raft/stats/detail/neighborhood_recall.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace raft::stats::detail { + +template +__global__ void neighborhood_recall( + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + std::optional> + distances, + std::optional> + ref_distances, + raft::device_scalar_view recall_score, + DistanceValueType const eps) +{ + auto constexpr kThreadsPerBlock = 32; + IndexType const row_idx = blockIdx.x; + auto const lane_idx = threadIdx.x % kThreadsPerBlock; + + // Each warp stores a recall score computed across the columns per row + IndexType thread_recall_score = 0; + for (IndexType col_idx = lane_idx; col_idx < indices.extent(1); col_idx += kThreadsPerBlock) { + for (IndexType ref_col_idx = 0; ref_col_idx < ref_indices.extent(1); ref_col_idx++) { + if (indices(row_idx, col_idx) == ref_indices(row_idx, ref_col_idx)) { + thread_recall_score += 1; + break; + } else if (distances.has_value()) { + auto dist = distances.value()(row_idx, col_idx); + auto ref_dist = ref_distances.value()(row_idx, ref_col_idx); + DistanceValueType diff = raft::abs(dist - ref_dist); + DistanceValueType m = std::max(raft::abs(dist), raft::abs(ref_dist)); + DistanceValueType ratio = diff > eps ? diff / m : diff; + + if (ratio <= eps) { + thread_recall_score += 1; + break; + } + } + } + } + + // Reduce across a warp for row score + typedef cub::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + ScalarType row_recall_score = BlockReduce(temp_storage).Sum(thread_recall_score); + + // Reduce across all rows for global score + if (lane_idx == 0) { + cuda::atomic_ref device_recall_score{ + *recall_score.data_handle()}; + std::size_t const total_count = indices.extent(0) * indices.extent(1); + device_recall_score.fetch_add(row_recall_score / total_count); + } +} + +template +void neighborhood_recall( + raft::resources const& res, + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + std::optional> + distances, + std::optional> + ref_distances, + raft::device_scalar_view recall_score, + DistanceValueType const eps) +{ + // One warp per row, launch a warp-width block per-row kernel + auto constexpr kThreadsPerBlock = 32; + auto const num_blocks = indices.extent(0); + + neighborhood_recall<<>>( + indices, ref_indices, distances, ref_distances, recall_score, eps); +} + +} // end namespace raft::stats::detail diff --git a/cpp/include/raft/stats/neighborhood_recall.cuh b/cpp/include/raft/stats/neighborhood_recall.cuh new file mode 100644 index 0000000000..e082bc87b4 --- /dev/null +++ b/cpp/include/raft/stats/neighborhood_recall.cuh @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/include/raft/stats/neighborhood_recall.cuh b/cpp/include/raft/stats/neighborhood_recall.cuh new file mode 100644 index 0000000000..e082bc87b4 --- /dev/null +++ b/cpp/include/raft/stats/neighborhood_recall.cuh @@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/neighborhood_recall.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace raft::stats {
+
+/**
+ * @defgroup stats_neighborhood_recall Neighborhood Recall Score
+ * @{
+ */
+
+/**
+ * @brief Calculate the Neighborhood Recall score on the device for indices and distances computed
+ * by any Nearest Neighbors algorithm against reference indices and distances. The recall score is
+ * calculated by counting the matching indices and dividing that count by the total size of the
+ * indices matrix of dimensions (D, k). If distance matrices are provided, a non-matching index is
+ * still counted as a match if abs(dist - ref_dist) < eps.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::device_resources res;
+ *   // assume D rows and N column dataset
+ *   auto k = 64;
+ *   auto indices = raft::make_device_matrix<int>(res, D, k);
+ *   auto distances = raft::make_device_matrix<float>(res, D, k);
+ *   // run ANN algorithm of choice
+ *
+ *   auto ref_indices = raft::make_device_matrix<int>(res, D, k);
+ *   auto ref_distances = raft::make_device_matrix<float>(res, D, k);
+ *   // run brute-force KNN for reference
+ *
+ *   auto scalar = 0.0f;
+ *   auto recall_score = raft::make_device_scalar(res, scalar);
+ *
+ *   raft::stats::neighborhood_recall(res,
+ *                                    raft::make_const_mdspan(indices.view()),
+ *                                    raft::make_const_mdspan(ref_indices.view()),
+ *                                    recall_score.view(),
+ *                                    raft::make_const_mdspan(distances.view()),
+ *                                    raft::make_const_mdspan(ref_distances.view()));
+ * @endcode
+ *
+ * @tparam IndicesValueType data-type of the indices
+ * @tparam IndexType data-type to index all matrices
+ * @tparam ScalarType data-type to store recall score
+ * @tparam DistanceValueType data-type of the distances
+ * @param res raft::resources object to manage resources
+ * @param[in] indices raft::device_matrix_view indices of neighbors
+ * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors
+ * @param[out] recall_score raft::device_scalar_view output recall score
+ * @param[in] distances (optional) raft::device_matrix_view distances of neighbors
+ * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors
+ * @param[in] eps (optional, default = 0.001) value within which distances are considered matching
+ */
+template <typename IndicesValueType,
+          typename IndexType,
+          typename ScalarType,
+          typename DistanceValueType = float>
+void neighborhood_recall(
+  raft::resources const& res,
+  raft::device_matrix_view<const IndicesValueType, IndexType, raft::row_major> indices,
+  raft::device_matrix_view<const IndicesValueType, IndexType, raft::row_major> ref_indices,
+  raft::device_scalar_view<ScalarType> recall_score,
+  std::optional<raft::device_matrix_view<const DistanceValueType, IndexType, raft::row_major>>
+    distances = std::nullopt,
+  std::optional<raft::device_matrix_view<const DistanceValueType, IndexType, raft::row_major>>
+    ref_distances = std::nullopt,
+  std::optional<raft::host_scalar_view<DistanceValueType>> eps = std::nullopt)
+{
+  RAFT_EXPECTS(indices.extent(0) == ref_indices.extent(0),
+               "The number of rows in indices and reference indices should be equal");
+  RAFT_EXPECTS(indices.extent(1) == ref_indices.extent(1),
+               "The number of columns in indices and reference indices should be equal");
+
+  if (distances.has_value() or ref_distances.has_value()) {
+    RAFT_EXPECTS(distances.has_value() and ref_distances.has_value(),
+                 "Both distances and reference distances should have values");
+
+    RAFT_EXPECTS(distances.value().extent(0) == ref_distances.value().extent(0),
+                 "The number of rows in distances and reference distances should be equal");
+    RAFT_EXPECTS(distances.value().extent(1) == ref_distances.value().extent(1),
+                 "The number of columns in distances and reference distances should be equal");
+
+    RAFT_EXPECTS(indices.extent(0) == distances.value().extent(0),
+                 "The number of rows in indices and distances
should be equal"); + RAFT_EXPECTS(indices.extent(1) == distances.value().extent(1), + "The number of columns in indices and distances should be equal"); + } + + DistanceValueType eps_val = 0.001; + if (eps.has_value()) { eps_val = *eps.value().data_handle(); } + + detail::neighborhood_recall( + res, indices, ref_indices, distances, ref_distances, recall_score, eps_val); +} + +/** + * @brief Calculate Neighborhood Recall score on the host for indices, distances computed by any + * Nearest Neighbors Algorithm against reference indices, distances. Recall score is calculated by + * comparing the total number of matching indices and dividing that value by the total size of the + * indices matrix of dimensions (D, k). If distance matrices are provided, then non-matching indices + * could be considered a match if abs(dist, ref_dist) < eps. + * + * Usage example: + * @code{.cpp} + * raft::device_resources res; + * // assume D rows and N column dataset + * auto k = 64; + * auto indices = raft::make_device_matrix(res, D, k); + * auto distances = raft::make_device_matrix(res, D, k); + * // run ANN algorithm of choice + * + * auto ref_indices = raft::make_device_matrix(res, D, k); + * auto ref_distances = raft::make_device_matrix(res, D, k); + * // run brute-force KNN for reference + * + * auto scalar = 0.0f; + * auto recall_score = raft::make_host_scalar(scalar); + * + * raft::stats::neighborhood_recall(res, + raft::make_const_mdspan(indices.view()), + raft::make_const_mdspan(ref_indices.view()), + recall_score.view(), + raft::make_const_mdspan(distances.view()), + raft::make_const_mdspan(ref_distances.view())); + * @endcode + * + * @tparam IndicesValueType data-type of the indices + * @tparam IndexType data-type to index all matrices + * @tparam ScalarType data-type to store recall score + * @tparam DistanceValueType data-type of the distances + * @param res raft::resources object to manage resources + * @param[in] indices raft::device_matrix_view indices of neighbors + * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors + * @param[out] recall_score raft::host_scalar_view output recall score + * @param[in] distances (optional) raft::device_matrix_view distances of neighbors + * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors + * @param[in] eps (optional, default = 0.001) value within which distances are considered matching + */ +template +void neighborhood_recall( + raft::resources const& res, + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + raft::host_scalar_view recall_score, + std::optional> + distances = std::nullopt, + std::optional> + ref_distances = std::nullopt, + std::optional> eps = std::nullopt) +{ + auto recall_score_d = raft::make_device_scalar(res, *recall_score.data_handle()); + neighborhood_recall( + res, indices, ref_indices, recall_score_d.view(), distances, ref_distances, eps); + raft::update_host(recall_score.data_handle(), + recall_score_d.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); +} + +/** @} */ // end group stats_recall + +} // end namespace raft::stats diff --git a/cpp/internal/raft_internal/neighbors/refine_helper.cuh b/cpp/internal/raft_internal/neighbors/refine_helper.cuh index ee06d90851..4a06116877 100644 --- a/cpp/internal/raft_internal/neighbors/refine_helper.cuh +++ b/cpp/internal/raft_internal/neighbors/refine_helper.cuh @@ -61,16 +61,20 @@ class RefineHelper { refined_distances_host(handle), 
refined_indices_host(handle) { - raft::random::Rng r(1234ULL); + raft::random::RngState rng(1234ULL); dataset = raft::make_device_matrix(handle_, p.n_rows, p.dim); queries = raft::make_device_matrix(handle_, p.n_queries, p.dim); if constexpr (std::is_same{}) { - r.uniform(dataset.data_handle(), dataset.size(), DataT(-10.0), DataT(10.0), stream_); - r.uniform(queries.data_handle(), queries.size(), DataT(-10.0), DataT(10.0), stream_); + raft::random::uniform( + handle, rng, dataset.data_handle(), dataset.size(), DataT(-10.0), DataT(10.0)); + raft::random::uniform( + handle, rng, queries.data_handle(), queries.size(), DataT(-10.0), DataT(10.0)); } else { - r.uniformInt(dataset.data_handle(), dataset.size(), DataT(1), DataT(20), stream_); - r.uniformInt(queries.data_handle(), queries.size(), DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle, rng, dataset.data_handle(), dataset.size(), DataT(1), DataT(20)); + raft::random::uniformInt( + handle, rng, queries.data_handle(), queries.size(), DataT(1), DataT(20)); } refined_distances = raft::make_device_matrix(handle_, p.n_queries, p.k); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0651ccac86..9b9b882d1d 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_test_init() function(ConfigureTest) - set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA) set(oneValueArgs NAME GPUS PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -37,7 +37,11 @@ function(ConfigureTest) set(_RAFT_TEST_PERCENT 100) endif() - set(TEST_NAME ${_RAFT_TEST_NAME}) + if(_RAFT_TEST_NOCUDA) + set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA") + else() + set(TEST_NAME ${_RAFT_TEST_NAME}) + endif() add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( @@ -68,6 +72,9 @@ function(ConfigureTest) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() + if(_RAFT_TEST_NOCUDA) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") + endif() target_include_directories(${TEST_NAME} PUBLIC "$") @@ -117,6 +124,8 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp @@ -124,12 +133,18 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest( + NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB + EXPLICIT_INSTANTIATE_ONLY NOCUDA + ) + ConfigureTest( NAME DISTANCE_TEST @@ -420,6 +435,7 @@ if(BUILD_TESTS) test/stats/mean_center.cu test/stats/minmax.cu test/stats/mutual_info_score.cu + test/stats/neighborhood_recall.cu test/stats/r2_score.cu test/stats/rand_index.cu test/stats/regression_metrics.cu diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp new file mode 100644 index 0000000000..2f938e3035 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} +#endif + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = resources{}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should 
use std::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = 
make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +#endif + +} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..95d7d3befd --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype 
conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_host_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + 
in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + 
res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +} // namespace raft diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp new file mode 100644 index 0000000000..715c53fe21 --- /dev/null +++ b/cpp/test/core/stream_view.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif +namespace raft { +TEST(StreamView, Default) +{ + auto stream = stream_view_per_thread; + ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED); + ASSERT_FALSE(stream.is_default()); + if (raft::CUDA_ENABLED) { + EXPECT_NO_THROW(stream.synchronize()); + EXPECT_NO_THROW(stream.interruptible_synchronize()); + } else { + EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error); + EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error); + } + EXPECT_NO_THROW(stream.synchronize_no_throw()); + EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled()); +#ifndef RAFT_DISABLE_CUDA + static_assert(std::is_same_v, + "underlying should return rmm::cuda_stream_view"); +#endif +} +} // namespace raft diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index d5fecd93c6..a9dbd8328f 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -99,9 +99,9 @@ class GramMatrixTest : public ::testing::TestWithParam { gram_host.resize(gram.size()); std::fill(gram_host.begin(), gram_host.end(), 0); - raft::random::Rng r(42137ULL); - r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); - r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); } ~GramMatrixTest() override {} diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index fd1b4e7b45..8578fe9637 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -124,7 +124,7 @@ class ReduceTest : public ::testing::TestWithParam(std::floor((24 - std::log2(dim)) / 2)); - rng.uniformInt(reinterpret_cast(ptr), size, 0u, resolution - 1, cuda_stream); + raft::random::uniformInt(handle, rng, reinterpret_cast(ptr), size, 0u, resolution - 1); GenerateRoundingErrorFreeDataset_kernel<<>>( ptr, size, resolution); @@ -293,13 +294,16 @@ class AnnCagraTest : public ::testing::TestWithParam { { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::normal( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } @@ -379,11 +383,12 @@ class AnnCagraSortTest : public ::testing::TestWithParam { void SetUp() override { database.resize(((size_t)ps.n_rows) * ps.dim, handle_.get_stream()); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - GenerateRoundingErrorFreeDataset(database.data(), ps.n_rows, ps.dim, r, handle_.get_stream()); + 
GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), handle_.get_stream()); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); } handle_.sync_stream(); } @@ -643,13 +648,16 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::normal( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 71d48cdeb7..7b1d32ca83 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -411,13 +411,17 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { database.resize(ps.num_db_vecs * ps.dim, stream_); search_queries.resize(ps.num_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::uniform( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::uniform( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index e03d09ae50..d1f5ee5b03 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -170,13 +170,17 @@ class ivf_pq_test : public ::testing::TestWithParam { database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_); search_queries.resize(size_t{ps.num_queries} * size_t{ps.dim}, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::uniform( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), 
DataT(2.0)); + raft::random::uniform( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 948323cf6e..d62b863437 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -121,11 +121,12 @@ class AnnNNDescentTest : public ::testing::TestWithParam { void SetUp() override { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index be60ec5b6d..18860100f3 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -123,13 +123,13 @@ struct idx_dist_pair { idx_dist_pair(IdxT x, DistT y, CompareDist op) : idx(x), dist(y), eq_compare(op) {} }; +/** Calculate recall value using only neighbor indices + */ template -auto eval_recall(const std::vector& expected_idx, +auto calc_recall(const std::vector& expected_idx, const std::vector& actual_idx, size_t rows, - size_t cols, - double eps, - double min_recall) -> testing::AssertionResult + size_t cols) { size_t match_count = 0; size_t total_count = static_cast(rows) * static_cast(cols); @@ -147,8 +147,21 @@ auto eval_recall(const std::vector& expected_idx, } } } - double actual_recall = static_cast(match_count) / static_cast(total_count); - double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); + return std::make_tuple( + static_cast(match_count) / static_cast(total_count), match_count, total_count); +} + +template +auto eval_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + size_t rows, + size_t cols, + double eps, + double min_recall) -> testing::AssertionResult +{ + auto [actual_recall, match_count, total_count] = + calc_recall(expected_idx, actual_idx, rows, cols); + double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", actual_recall, match_count, @@ -164,17 +177,16 @@ auto eval_recall(const std::vector& expected_idx, return testing::AssertionSuccess(); } -/** same as eval_recall, but in case indices do not match, - * then check distances as well, and accept match if actual dist is equal to expected_dist */ +/** Overload of calc_recall to account for distances + */ template -auto eval_neighbours(const std::vector& expected_idx, - const std::vector& actual_idx, - const std::vector& expected_dist, - const std::vector& actual_dist, - size_t rows, - size_t cols, 
- double eps, - double min_recall) -> testing::AssertionResult +auto calc_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + double eps) { size_t match_count = 0; size_t total_count = static_cast(rows) * static_cast(cols); @@ -196,8 +208,25 @@ auto eval_neighbours(const std::vector& expected_idx, } } } - double actual_recall = static_cast(match_count) / static_cast(total_count); - double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); + return std::make_tuple( + static_cast(match_count) / static_cast(total_count), match_count, total_count); +} + +/** same as eval_recall, but in case indices do not match, + * then check distances as well, and accept match if actual dist is equal to expected_dist */ +template +auto eval_neighbours(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + double eps, + double min_recall) -> testing::AssertionResult +{ + auto [actual_recall, match_count, total_count] = + calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); + double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", actual_recall, match_count, diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/test/random/rmat_rectangular_generator.cu index 1af3d2be31..77af44f133 100644 --- a/cpp/test/random/rmat_rectangular_generator.cu +++ b/cpp/test/random/rmat_rectangular_generator.cu @@ -178,7 +178,7 @@ class RmatGenTest : public ::testing::TestWithParam { max_scale{std::max(params.r_scale, params.c_scale)} { theta.resize(4 * max_scale, stream); - uniform(state, theta.data(), theta.size(), 0.0f, 1.0f, stream); + uniform(handle, state, theta.data(), theta.size(), 0.0f, 1.0f); normalize(theta.data(), theta.data(), max_scale, @@ -271,7 +271,7 @@ class RmatGenMdspanTest : public ::testing::TestWithParam { max_scale{std::max(params.r_scale, params.c_scale)} { theta.resize(4 * max_scale, stream); - uniform(state, theta.data(), theta.size(), 0.0f, 1.0f, stream); + uniform(handle, state, theta.data(), theta.size(), 0.0f, 1.0f); normalize(theta.data(), theta.data(), max_scale, diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index 7b4736a08c..ca43aa83b9 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -125,7 +125,7 @@ class GramMatrixTest : public ::testing::TestWithParam { protected: GramMatrixTest() : params(GetParam()), - stream(0), + stream(resource::get_cuda_stream(handle)), x1(0, stream), x2(0, stream), x1_csr_indptr(0, stream), @@ -137,8 +137,6 @@ class GramMatrixTest : public ::testing::TestWithParam { gram(0, stream), gram_host(0) { - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } if (params.ld_out == 0) { params.ld_out = params.is_row_major ? 
params.n2 : params.n1; } @@ -154,14 +152,14 @@ class GramMatrixTest : public ::testing::TestWithParam { gram_host.resize(gram.size()); std::fill(gram_host.begin(), gram_host.end(), 0); - raft::random::Rng r(42137ULL); - r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); - r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + raft::random::RngState r(42137ULL); + raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1)); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } - ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } + ~GramMatrixTest() override {} int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) { diff --git a/cpp/test/stats/neighborhood_recall.cu b/cpp/test/stats/neighborhood_recall.cu new file mode 100644 index 0000000000..43ae7059bd --- /dev/null +++ b/cpp/test/stats/neighborhood_recall.cu @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../neighbors/ann_utils.cuh" +#include "../test_utils.h" + +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace raft::stats { + +struct NeighborhoodRecallInputs { + int n_rows; + int n_cols; + int k; +}; + +template +class NeighborhoodRecallTest : public ::testing::TestWithParam { + public: + NeighborhoodRecallTest() + : ps{::testing::TestWithParam::GetParam()}, + data_1{raft::make_device_matrix(res, ps.n_rows, ps.n_cols)}, + data_2{raft::make_device_matrix(res, ps.n_rows, ps.n_cols)} + { + } + + protected: + void test_recall() + { + size_t queries_size = ps.n_rows * ps.k; + + // calculate nn for dataset 1 + auto distances_1 = raft::make_device_matrix(res, ps.n_rows, ps.k); + auto indices_1 = raft::make_device_matrix(res, ps.n_rows, ps.k); + raft::neighbors::naive_knn( + res, + distances_1.data_handle(), + indices_1.data_handle(), + data_1.data_handle(), + data_1.data_handle(), + ps.n_rows, + ps.n_rows, + ps.n_cols, + ps.k, + raft::distance::DistanceType::L2Expanded); + std::vector distances_1_h(queries_size); + std::vector indices_1_h(queries_size); + raft::copy(distances_1_h.data(), + distances_1.data_handle(), + ps.n_rows * ps.k, + raft::resource::get_cuda_stream(res)); + raft::copy(indices_1_h.data(), + indices_1.data_handle(), + ps.n_rows * ps.k, + raft::resource::get_cuda_stream(res)); + + // calculate nn for dataset 2 + auto distances_2 = raft::make_device_matrix(res, ps.n_rows, ps.k); + auto indices_2 = raft::make_device_matrix(res, ps.n_rows, ps.k); + raft::neighbors::naive_knn( + res, + distances_2.data_handle(), + indices_2.data_handle(), + data_2.data_handle(), + data_2.data_handle(), + ps.n_rows, + ps.n_rows, + ps.n_cols, + ps.k, + raft::distance::DistanceType::L2Expanded); + std::vector distances_2_h(queries_size); + std::vector indices_2_h(queries_size); + raft::copy(distances_2_h.data(), + 
distances_2.data_handle(),
+               ps.n_rows * ps.k,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(indices_2_h.data(),
+               indices_2.data_handle(),
+               ps.n_rows * ps.k,
+               raft::resource::get_cuda_stream(res));
+
+    raft::resource::sync_stream(res);
+
+    // find CPU recall scores
+    [[maybe_unused]] auto [indices_only_recall_h, mc1, tc1] =
+      raft::neighbors::calc_recall(indices_1_h, indices_2_h, ps.n_rows, ps.k);
+    [[maybe_unused]] auto [recall_h, mc2, tc2] = raft::neighbors::calc_recall(
+      indices_1_h, indices_2_h, distances_1_h, distances_2_h, ps.n_rows, ps.k, 0.001);
+
+    // find GPU recall scores
+    auto s1                         = 0;
+    auto indices_only_recall_scalar = raft::make_host_scalar(s1);
+    neighborhood_recall(res,
+                        raft::make_const_mdspan(indices_1.view()),
+                        raft::make_const_mdspan(indices_2.view()),
+                        indices_only_recall_scalar.view());
+
+    auto s2            = 0;
+    auto recall_scalar = raft::make_host_scalar(s2);
+    DistanceT s3       = 0.001;
+    auto eps_mda       = raft::make_host_scalar(s3);
+
+    neighborhood_recall(res,
+                        raft::make_const_mdspan(indices_1.view()),
+                        raft::make_const_mdspan(indices_2.view()),
+                        recall_scalar.view(),
+                        raft::make_const_mdspan(distances_1.view()),
+                        raft::make_const_mdspan(distances_2.view()));
+
+    // assert correctness
+    ASSERT_TRUE(raft::match(indices_only_recall_h,
+                            *indices_only_recall_scalar.data_handle(),
+                            raft::CompareApprox(0.01)));
+    ASSERT_TRUE(
+      raft::match(recall_h, *recall_scalar.data_handle(), raft::CompareApprox(0.01)));
+  }
+
+  void SetUp() override
+  {
+    // form two random datasets
+    raft::random::Rng r1(1234ULL);
+    r1.normal(data_1.data_handle(),
+              ps.n_rows * ps.n_cols,
+              DistanceT(0.1),
+              DistanceT(2.0),
+              raft::resource::get_cuda_stream(res));
+    raft::random::Rng r2(21111ULL);
+    r2.normal(data_2.data_handle(),
+              ps.n_rows * ps.n_cols,
+              DistanceT(0.1),
+              DistanceT(2.0),
+              raft::resource::get_cuda_stream(res));
+    resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  NeighborhoodRecallInputs ps;
+  raft::device_matrix data_1;
+  raft::device_matrix data_2;
+};
+
+const std::vector<NeighborhoodRecallInputs> inputs =
+  raft::util::itertools::product<NeighborhoodRecallInputs>({10, 50, 100},  // n_rows
+                                                           {80, 100},      // n_cols
+                                                           {32, 64});      // k
+
+using NeighborhoodRecallTestF_U32 = NeighborhoodRecallTest;
+TEST_P(NeighborhoodRecallTestF_U32, AnnCagra) { this->test_recall(); }
+
+INSTANTIATE_TEST_CASE_P(NeighborhoodRecallTest,
+                        NeighborhoodRecallTestF_U32,
+                        ::testing::ValuesIn(inputs));
+
+}  // end namespace raft::stats
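As a reading aid from us (not part of the patch): with $D$ query rows and $k$ neighbors per row, the score that both the host helpers and the device kernel compute is

```latex
\mathrm{recall} \;=\;
  \frac{\#\bigl\{(i,j) \;:\; \exists\, j' \text{ such that } \mathrm{indices}(i,j) = \mathrm{ref\_indices}(i,j')\bigr\}}{D \cdot k}
```

with the optional `eps` rule widening the numerator to include distance-based matches.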
diff --git a/cpp/test/test_utils.cuh b/cpp/test/test_utils.cuh index 5704eefae3..1afa7acc83 100644 --- a/cpp/test/test_utils.cuh +++ b/cpp/test/test_utils.cuh @@ -1,5 +1,5 @@ /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -228,38 +228,39 @@ testing::AssertionResult diagonalMatch(
 }
 
 template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(T* out,
+typename std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(const raft::resources& handle,
+                                                                   T* out,
                                                                    raft::random::RngState& rng,
                                                                    IdxT len,
-                                                                   cudaStream_t stream,
                                                                    T range_min = T(-1),
                                                                    T range_max = T(1))
 {
-  raft::random::uniform(rng, out, len, range_min, range_max, stream);
+  raft::random::uniform(handle, rng, out, len, range_min, range_max);
 }
 
 template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_integral_v<T>> gen_uniform(T* out,
+typename std::enable_if_t<std::is_integral_v<T>> gen_uniform(const raft::resources& handle,
+                                                             T* out,
                                                              raft::random::RngState& rng,
                                                              IdxT len,
-                                                             cudaStream_t stream,
                                                              T range_min = T(0),
                                                              T range_max = T(100))
 {
-  raft::random::uniformInt(rng, out, len, range_min, range_max, stream);
+  raft::random::uniformInt(handle, rng, out, len, range_min, range_max);
 }
 
 template <typename T1, typename T2, typename IdxT>
-void gen_uniform(raft::KeyValuePair<T1, T2>* out,
+void gen_uniform(const raft::resources& handle,
+                 raft::KeyValuePair<T1, T2>* out,
                  raft::random::RngState& rng,
-                 IdxT len,
-                 cudaStream_t stream)
+                 IdxT len)
 {
+  auto stream = resource::get_cuda_stream(handle);
   rmm::device_uvector<T1> keys(len, stream);
   rmm::device_uvector<T2> values(len, stream);
-  gen_uniform(keys.data(), rng, len, stream);
-  gen_uniform(values.data(), rng, len, stream);
+  gen_uniform(handle, keys.data(), rng, len);
+  gen_uniform(handle, values.data(), rng, len);
 
   const T1* d_keys   = keys.data();
   const T2* d_values = values.data();
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/test/util/bitonic_sort.cu index 2cf5420334..f928480b54 100644 --- a/cpp/test/util/bitonic_sort.cu +++ b/cpp/test/util/bitonic_sort.cu @@ -109,6 +109,7 @@ class BitonicTest : public testing::TestWithParam {  // NOLINT
   std::vector<T> in;   // NOLINT
   std::vector<T> out;  // NOLINT
   std::vector<T> ref;  // NOLINT
+  raft::resources handle_;
 
   void segmented_sort(std::vector<T>& vec, int k, bool ascending)  // NOLINT
   {
@@ -128,14 +129,14 @@
     }
   }
 
-  void fill_random(rmm::device_uvector<T>& arr, rmm::cuda_stream_view stream)
+  void fill_random(rmm::device_uvector<T>& arr)
   {
-    raft::random::Rng rng(42);
+    raft::random::RngState rng(42);
     if constexpr (std::is_floating_point_v<T>) {
-      return rng.normal(arr.data(), arr.size(), T(10), T(100), stream);
+      return raft::random::normal(handle_, rng, arr.data(), arr.size(), T(10), T(100));
     }
     if constexpr (std::is_integral_v<T>) {
-      return rng.normalInt(arr.data(), arr.size(), T(10), T(100), stream);
+      return raft::random::normalInt(handle_, rng, arr.data(), arr.size(), T(10), T(100));
     }
   }
 
@@ -146,11 +147,11 @@
     out(spec.len()),
     ref(spec.len())
   {
-    auto stream = rmm::cuda_stream_default;
+    auto stream = resource::get_cuda_stream(handle_);
 
     // generate input
     rmm::device_uvector<T> arr_d(spec.len(), stream);
-    fill_random(arr_d, stream);
+    fill_random(arr_d);
     update_host(in.data(), arr_d.data(), arr_d.size(), stream);
 
     // calculate the results
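The recurring change in the test updates above (`gram.cu`, `bitonic_sort.cu`, `test_utils.cuh`, and the ANN test fixtures) replaces the stream-based `raft::random::Rng` member functions with handle-first free functions over a `raft::random::RngState`. A minimal before/after sketch (our own illustration, not part of the patch):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/random/rng.cuh>

#include <rmm/device_uvector.hpp>

void fill_example(raft::device_resources const& handle)
{
  auto stream = handle.get_stream();
  rmm::device_uvector<float> buf(1024, stream);

  // Old API (removed throughout this diff): seed an Rng and pass the stream explicitly.
  //   raft::random::Rng r(1234ULL);
  //   r.uniform(buf.data(), buf.size(), 0.0f, 1.0f, stream);

  // New API: a free function taking the resources handle and a persistent RngState.
  raft::random::RngState rng(1234ULL);
  raft::random::uniform(handle, rng, buf.data(), buf.size(), 0.0f, 1.0f);
}
```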
a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -53,6 +53,7 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g | `graph_degree` | `build_param` | N | Positive Integer >0 | 64 | Degree of the final kNN graph index. | | `intermediate_graph_degree` | `build_param` | N | Positive Integer >0 | 128 | Degree of the intermediate kNN graph. | | `graph_build_algo` | `build_param` | N | ["IVF_PQ", "NN_DESCENT"] | "IVF_PQ" | Algorithm to use for building the kNN graph | +| `nn_descent_niter` | `build_param` | N | Positive Integer >0 | 20 | Number of iterations when using NN_DESCENT. | | `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside in? | | `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside in? | | `itopk` | `search_params` | N | Positive Integer >0 | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. | diff --git a/docs/source/cpp_api/stats_neighborhood.rst b/docs/source/cpp_api/stats_neighborhood.rst index f80e349c3b..7c7ad90a49 100644 --- a/docs/source/cpp_api/stats_neighborhood.rst +++ b/docs/source/cpp_api/stats_neighborhood.rst @@ -16,3 +16,15 @@ namespace *raft::stats* :project: RAFT :members: :content-only: + +Neighborhood Recall +------------------- + +``#include <raft/stats/neighborhood_recall.cuh>`` + +namespace *raft::stats* + +.. doxygengroup:: stats_neighborhood_recall + :project: RAFT + :members: + :content-only: diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 132e38984c..2e8572c299 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -2,9 +2,26 @@ This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. -## Installing the benchmarks +## Table of Contents -The easiest way to install these benchmarks is through conda. We provide packages for GPU enabled systems, as well for systems without a GPU. We suggest using mamba as it generally leads to a faster install time: +- [Installing and Running the Benchmarks](#installing-and-running-the-benchmarks) + - [Using conda](#conda) + - [Using Docker](#docker) +- [End-to-end example: Million-scale](#end-to-end-example-million-scale) +- [End-to-end example: Billion-scale](#end-to-end-example-billion-scale) +- [Creating and customizing dataset configurations](#creating-and-customizing-dataset-configurations) +- [Adding a new ANN algorithm](#adding-a-new-ann-algorithm) + +## Installing and Running the Benchmarks + +There are two main ways the pre-compiled benchmarks are distributed: + +- [Conda](#conda): A great solution for users who don't use containers but want an easy-to-install and easy-to-use Python package. Pip wheels are planned as an alternative for users who cannot use conda and prefer not to use containers. +- [Docker](#docker): A great solution that requires only Docker and the NVIDIA Container Toolkit. Provides a single `docker run` command for basic dataset benchmarking, as well as all the functionality of the conda solution inside the containers. + +## Conda + +If containers are not an option or not preferred, the easiest way to install the ANN benchmarks is through conda. We provide packages for GPU-enabled systems, as well as for systems without a GPU.
We suggest using mamba as it generally leads to a faster install time: ```bash @@ -24,7 +41,7 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks -### Usage +### Python Package Usage There are 4 general steps to running the benchmarks and visualizing the results: 1. Prepare Dataset 2. Build Index and Search Index @@ -60,12 +77,112 @@ Configuration files already exist for the following list of the million-scale da - `fashion-mnist-784-euclidean` - `glove-50-angular` - `glove-100-angular` -- `lastfm-65-angular` - `mnist-784-euclidean` - `nytimes-256-angular` - `sift-128-euclidean` -### End-to-end example: Billion-scale +## Docker + +We provide images for GPU-enabled systems, as well as for systems without a GPU. The following images are available: + +- `raft-ann-bench`: Contains GPU and CPU benchmarks and can run all supported algorithms. Will download million-scale datasets as required. Best suited for users who prefer a smaller container size for GPU-based systems. Requires the NVIDIA Container Toolkit to run GPU algorithms; CPU algorithms can run without it. +- `raft-ann-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users who want to run multiple million-scale datasets already included in the image. +- `raft-ann-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users who want the smallest containers to reproduce benchmarks on systems without a GPU. + +Nightly images are located on [dockerhub](https://hub.docker.com/r/rapidsai/raft-ann-bench), while release (stable) versions are located in [NGC](https://hub.docker.com/r/rapidsai/raft-ann-bench), starting with release 23.10. + +- The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and RAFT version 23.10: + +```bash +docker pull rapidsai/raft-ann-bench:23.10a-cuda12.0-py3.10 # substitute raft-ann-bench for the exact desired container. +``` + +The CUDA and Python versions can be changed to any of the supported values: + +Supported CUDA versions: 11.2 and 12.0. +Supported Python versions: 3.9 and 3.10. + +You can also see the exact available versions on the dockerhub site: + +- [RAFT ANN Benchmark images](https://hub.docker.com/r/rapidsai/raft-ann-bench/tags) + +- [RAFT ANN Benchmark with datasets preloaded images](https://hub.docker.com/r/rapidsai/raft-ann-bench-datasets/tags) + +- [RAFT ANN Benchmark CPU only images](https://hub.docker.com/r/rapidsai/raft-ann-bench-cpu/tags) + +**Note:** GPU containers use the CUDA toolkit from inside the container; the only requirement is a driver installed on the host machine that supports that version. So, for example, CUDA 11.8 containers can run on systems with a CUDA 12.x capable driver. + +- The following command (only available after the RAPIDS 23.10 release) pulls the container: + +```bash +docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:23.10-cuda11.8-py3.10 # substitute raft-ann-bench for the exact desired container. +```
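+
+To quickly verify that the host driver supports the CUDA version of a given image, you can run `nvidia-smi` inside the container. This is a hypothetical smoke test, not part of the benchmark tooling; adjust the image tag to the one you pulled:
+
+```bash
+# override the image entrypoint so only the driver check runs
+docker run --gpus all --rm --entrypoint nvidia-smi \
+    rapidsai/raft-ann-bench:23.10a-cuda12.0-py3.10
+```
+
+If the driver is incompatible, this command fails immediately instead of partway through a benchmark run.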
+ +### Container Usage + +The container can be used in two different ways: + +1. **Automated benchmark with a single `docker run` (easy mode)**: Helper scripts are included to ease running the benchmarks end-to-end: + +For GPU systems, where `$DATA_FOLDER` is a local folder where you want datasets stored in `$DATA_FOLDER/datasets` and results in `$DATA_FOLDER/result` (we highly recommend that `$DATA_FOLDER` be a dedicated folder for the datasets and results of the containers): + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --gpus all --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \ + "--dataset deep-image-96-angular" \ + "--normalize" \ + "--algorithms raft_cagra,raft_ivf_pq" \ + "" +``` + +Where: + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results # <- local folder to store datasets and results +docker run --gpus all --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \ # <- image to use, either `raft-ann-bench` or `raft-ann-bench-datasets`, can choose RAPIDS, CUDA, and Python versions. + "--dataset deep-image-96-angular" \ # <- dataset name + "--normalize" \ # <- whether to normalize the dataset, leave string empty ("") to not normalize. + "--algorithms raft_cagra" \ # <- what algorithm(s) to use as a comma-separated list, as well as any other argument to pass to `raft_ann_benchmarks.run` + "" # optional arguments to pass to `raft_ann_benchmarks.plot` +``` + +**Note about user and file permissions:** The flag `-u $(id -u)` allows the user inside the container to match the `uid` of the user outside the container, allowing the container to read and write to the mounted volume indicated by `$DATA_FOLDER`. + +For CPU systems the same interface applies, except that the `--gpus` argument is not needed and the CPU images are used: +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench-cpu:23.10a-py3.10 \ + "--dataset deep-image-96-angular" \ + "--normalize" \ + "--algorithms raft_cagra" \ + "" +``` + +**Note:** The user inside the containers is `root`. To work around this, the scripts in the containers fix the user of the output files after the benchmarks are run. If the benchmarks are interrupted, the owner of the `datasets/results` produced by the container will be wrong, and will need to be manually fixed by the user. + +2. **Using the preinstalled `raft_ann_benchmarks` python package (advanced mode)**: The docker containers are built using the conda packages described in the following section, so they can be used directly as if they had been installed manually. This is recommended for advanced users, and is the option that allows the full flexibility of the benchmarking scripts. To use the python scripts directly, use the following command: + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --gpus all --rm -it -u $(id -u) \ + --entrypoint /bin/bash \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 +``` + +This will drop you into a command line in the container, with the `raft_ann_benchmarks` python package ready to use, as was described in the prior [conda section](#conda): + +``` +(base) root@00b068fbb862:/home/rapids# +``` + +Additionally, the containers can be run in detached mode without any issue.
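+
+For example, here is a minimal detached-mode sketch. It assumes the same image tag and `$DATA_FOLDER` convention as above; the container name `raft-bench` is arbitrary:
+
+```bash
+export DATA_FOLDER=path/to/store/datasets/and/results
+# start the container in the background, kept alive by a long-running shell
+docker run --gpus all -d -u $(id -u) \
+    --name raft-bench \
+    --entrypoint /bin/bash \
+    -v $DATA_FOLDER:/home/rapids/benchmarks \
+    rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \
+    -c "sleep infinity"
+# attach an interactive shell whenever needed
+docker exec -it raft-bench /bin/bash
+# stop and remove the container when done
+docker stop raft-bench && docker rm raft-bench
+```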
+ +## End-to-end example: Billion-scale `raft-ann-bench.get_dataset` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale) because they are so large. You should instead use our billion-scale datasets guide to download and prepare them. All other python scripts mentioned below work as intended once the @@ -102,7 +219,7 @@ options: Path to billion-scale dataset groundtruth file (default: None) ``` -##### Step 1: Prepare Dataset +#### Step 1: Prepare Dataset The script `raft-ann-bench.get_dataset` will download and unpack the dataset in a directory that the user provides. As of now, only million-scale datasets are supported by this script. For more information, see [datasets and formats](ann_benchmarks_dataset.md). @@ -123,7 +240,7 @@ When option `normalize` is provided to the script, any dataset that has cosine d will be normalized to inner product. So, for example, the dataset `glove-100-angular` will be written at location `datasets/glove-100-inner/`. -#### Step 2: Build and Search Index +### Step 2: Build and Search Index The script `raft-ann-bench.run` will build and search indices for a given dataset and its specified configuration. To configure which algorithms are available, we use `algos.yaml`. @@ -180,7 +297,7 @@ it is assumed both are `True`. `indices` and `algorithms`: these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable. -#### Step 3: Data Export +### Step 3: Data Export The script `raft-ann-bench.data_export` will convert the intermediate JSON outputs produced by `raft-ann-bench.run` to more easily readable CSV files, which are needed to build charts made by `raft-ann-bench.plot`. diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json index fff3bca1d7..6e982873bb 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json @@ -1,4 +1,4 @@ -{ +{ "dataset": { "name": "mnist-784-euclidean", "base_file": "mnist-784-euclidean/base.fbin",