From f774b0b21fa3af4b553534dc56cb4f37d4e5c2c7 Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Tue, 29 Jun 2021 12:47:14 +0200
Subject: [PATCH 01/17] Replace RAFT buffer adapter with RMM buffer

---
 cpp/include/raft/common/cub_wrappers.cuh      |  4 +-
 cpp/include/raft/comms/test.hpp               | 41 +++------
 cpp/include/raft/distance/distance.cuh        |  7 +-
 cpp/include/raft/label/classlabels.cuh        | 10 +--
 cpp/include/raft/lap/lap.cuh                  | 83 ++++++++++---------
 cpp/include/raft/lap/lap_functions.cuh        | 47 ++++-------
 cpp/include/raft/linalg/eig.cuh               | 16 ++--
 cpp/include/raft/linalg/qr.cuh                | 18 ++--
 cpp/include/raft/linalg/svd.cuh               | 27 +++---
 cpp/include/raft/matrix/math.cuh              |  7 +-
 cpp/include/raft/random/rng.cuh               | 12 +--
 cpp/include/raft/sparse/convert/csr.cuh       | 10 +--
 cpp/include/raft/sparse/coo.cuh               | 27 +++---
 cpp/include/raft/sparse/csr.cuh               |  6 +-
 .../raft/sparse/distance/bin_distance.cuh     | 26 +++---
 cpp/include/raft/sparse/distance/csr_spmv.cuh |  4 +-
 .../raft/sparse/distance/ip_distance.cuh      | 33 ++++----
 .../raft/sparse/distance/l2_distance.cuh      | 32 +++----
 .../raft/sparse/distance/lp_distance.cuh      |  6 +-
 cpp/include/raft/sparse/linalg/add.cuh        |  4 +-
 cpp/include/raft/sparse/linalg/spectral.cuh   | 14 ++--
 cpp/include/raft/sparse/linalg/symmetrize.cuh |  9 +-
 cpp/include/raft/sparse/linalg/transpose.h    |  6 +-
 cpp/include/raft/sparse/op/filter.cuh         | 10 +--
 .../knn/detail/knn_brute_force_faiss.cuh      |  9 +-
 .../raft/spatial/knn/detail/processing.hpp    | 10 +--
 cpp/test/lap/lap.cu                           | 17 ++--
 cpp/test/linalg/cholesky_r1.cu                | 25 +++---
 cpp/test/linalg/map_then_reduce.cu            | 10 +--
 cpp/test/matrix/matrix.cu                     | 15 ++--
 cpp/test/sparse/dist_coo_spmv.cu              |  6 +-
 cpp/test/sparse/linkage.cu                    |  2 +-
 cpp/test/sparse/symmetrize.cu                 |  9 +-
 33 files changed, 252 insertions(+), 310 deletions(-)
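Every hunk below applies the same mechanical substitution, so it is worth
spelling the pattern out once. A minimal sketch (hypothetical function and
buffer names, not taken from any hunk; assumes a valid raft::handle_t and
RAFT's CUDA_CHECK macro from <raft/cudart_utils.h>):

    #include <raft/cudart_utils.h>
    #include <raft/handle.hpp>
    #include <rmm/device_uvector.hpp>

    void buffer_migration_example(const raft::handle_t &handle, std::size_t n) {
      cudaStream_t stream = handle.get_stream();

      // Old RAFT adapter: allocator passed explicitly, stream remembered by
      // the buffer, so later resize() calls took no stream argument:
      //   raft::mr::device::buffer<int> buf(handle.get_device_allocator(), stream, n);
      //   buf.resize(2 * n);

      // RMM equivalent: (size, stream) constructor, allocation drawn from the
      // current RMM device memory resource; every resize() names the stream.
      rmm::device_uvector<int> buf(n, stream);
      buf.resize(2 * n, stream);

      // device_uvector memory is uninitialized -- zero it explicitly wherever
      // the old code relied on initialized contents.
      CUDA_CHECK(cudaMemsetAsync(buf.data(), 0, buf.size() * sizeof(int), stream));
    }

Note that the loss of the remembered stream and of zero-initialization is why
the hunks below pass the stream to every resize() and keep (or add) explicit
cudaMemsetAsync calls.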
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index 8d5b29f700..8e3519fea5 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cub/cub.cuh>
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 
@@ -34,7 +34,7 @@ namespace raft {
  * @param stream cuda stream
  */
 template <typename KeyT, typename ValueT>
-void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
+void sortPairs(rmm::device_uvector<char> &workspace, const KeyT *inKeys,
                KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
                cudaStream_t stream) {
   size_t worksize;
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 4e95c4eef0..9f5edc1425 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -18,7 +18,6 @@
 
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 
 #include
 #include
@@ -44,8 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   CUDA_CHECK(
     cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
@@ -76,8 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   if (communicator.get_rank() == root)
     CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
@@ -104,8 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -134,11 +130,8 @@ bool test_collective_allgather(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       communicator.get_size());
+  rmm::device_uvector<int> temp_d(1, stream);
+  rmm::device_uvector<int> recv_d(communicator.get_size(), stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -169,12 +162,9 @@ bool test_collective_gather(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? communicator.get_size() : 0);
+  rmm::device_uvector<int> temp_d(1, stream);
+  rmm::device_uvector<int> recv_d(
+    communicator.get_rank() == root ? communicator.get_size() : 0, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -211,12 +201,9 @@ bool test_collective_gatherv(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(sends.size(), stream);
-
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? displacements.back() : 0);
+  rmm::device_uvector<int> temp_d(sends.size(), stream);
+  rmm::device_uvector<int> recv_d(
+    communicator.get_rank() == root ?
displacements.back() : 0, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
                              sends.size() * sizeof(int), cudaMemcpyHostToDevice,
@@ -256,10 +243,8 @@ bool test_collective_reducescatter(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream,
-                                       sends.size());
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       1);
+  rmm::device_uvector<int> temp_d(sends.size(), stream);
+  rmm::device_uvector<int> recv_d(1, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
                              sends.size() * sizeof(int), cudaMemcpyHostToDevice,
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 579b3bb446..a0a47a2bdb 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace distance {
@@ -243,7 +243,7 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
 template <typename Type, typename Index_, raft::distance::DistanceType DistType>
 void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
                             Index_ n, Index_ k,
-                            raft::mr::device::buffer<char> &workspace,
+                            rmm::device_uvector<char> &workspace,
                             cudaStream_t stream, bool isRowMajor) {
   auto worksize = getWorkspaceSize(x, y, m, n, k);
@@ -254,8 +254,7 @@ void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
 template <typename Type, typename Index_ = int>
 void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
-                       Index_ n, Index_ k,
-                       raft::mr::device::buffer<char> &workspace,
+                       Index_ n, Index_ k, rmm::device_uvector<char> &workspace,
                        raft::distance::DistanceType metric, cudaStream_t stream,
                        bool isRowMajor = true) {
   switch (metric) {
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 0da7da2eb6..f2b2463165 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace label {
@@ -46,9 +46,9 @@ template <typename value_t>
 void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
                      cudaStream_t stream,
                      std::shared_ptr<raft::mr::device::allocator> allocator) {
-  raft::mr::device::buffer<value_t> y2(allocator, stream, n);
-  raft::mr::device::buffer<value_t> y3(allocator, stream, n);
-  raft::mr::device::buffer<int> d_num_selected(allocator, stream, 1);
+  rmm::device_uvector<value_t> y2(n, stream);
+  rmm::device_uvector<value_t> y3(n, stream);
+  rmm::device_uvector<int> d_num_selected(1, stream);
 
   size_t bytes = 0;
   size_t bytes2 = 0;
@@ -58,7 +58,7 @@
   cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(),
                             d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
-  raft::mr::device::buffer<char> cub_storage(allocator, stream, bytes);
+  rmm::device_uvector<char> cub_storage(bytes, stream);
 
   // Select Unique classes
   cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n);
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index 6bc1c08029..29c1cf6aa1 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -25,6 +25,7 @@
 #pragma once
 
 #include
+#include <rmm/device_uvector.hpp>
 
 #include "d_structs.h"
 #include "lap_functions.cuh"
@@ -44,19 +45,19 @@ class LinearAssignmentProblem {
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
   raft::handle_t const &handle_;
 
-  raft::mr::device::buffer<int> row_covers_v;
-  raft::mr::device::buffer<int> col_covers_v;
-  raft::mr::device::buffer<weight_t> row_duals_v;
-  raft::mr::device::buffer<weight_t> col_duals_v;
-  raft::mr::device::buffer<weight_t> col_slacks_v;
-  raft::mr::device::buffer<int> row_is_visited_v;
-  raft::mr::device::buffer<int> col_is_visited_v;
-  raft::mr::device::buffer<vertex_t> row_parents_v;
-  raft::mr::device::buffer<vertex_t> col_parents_v;
-  raft::mr::device::buffer<vertex_t> row_children_v;
-  raft::mr::device::buffer<vertex_t> col_children_v;
-  raft::mr::device::buffer<weight_t> obj_val_primal_v;
-  raft::mr::device::buffer<weight_t> obj_val_dual_v;
+  rmm::device_uvector<int> row_covers_v;
+  rmm::device_uvector<int> col_covers_v;
+  rmm::device_uvector<weight_t> row_duals_v;
+  rmm::device_uvector<weight_t> col_duals_v;
+  rmm::device_uvector<weight_t> col_slacks_v;
+  rmm::device_uvector<int> row_is_visited_v;
+  rmm::device_uvector<int> col_is_visited_v;
+  rmm::device_uvector<vertex_t> row_parents_v;
+  rmm::device_uvector<vertex_t> col_parents_v;
+  rmm::device_uvector<vertex_t> row_children_v;
+  rmm::device_uvector<vertex_t> col_children_v;
+  rmm::device_uvector<weight_t> obj_val_primal_v;
+  rmm::device_uvector<weight_t> obj_val_dual_v;
 
  public:
   LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size,
                           vertex_t batchsize, weight_t epsilon)
     : handle_(handle),
       size_(size),
       batchsize_(batchsize),
       epsilon_(epsilon),
       d_costs_(nullptr),
-      row_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_slacks_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {}
+      row_covers_v(0, handle_.get_stream()),
+      col_covers_v(0, handle_.get_stream()),
+      row_duals_v(0, handle_.get_stream()),
+      col_duals_v(0, handle_.get_stream()),
+      col_slacks_v(0, handle_.get_stream()),
+      row_is_visited_v(0, handle_.get_stream()),
+      col_is_visited_v(0, handle_.get_stream()),
+      row_parents_v(0, handle_.get_stream()),
+      col_parents_v(0, handle_.get_stream()),
+      row_children_v(0, handle_.get_stream()),
+      col_children_v(0, handle_.get_stream()),
+      obj_val_primal_v(0, handle_.get_stream()),
+      obj_val_dual_v(0, handle_.get_stream()) {}
 
   // Executes Hungarian algorithm on the input cost matrix.
   void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment,
@@ -152,19 +153,20 @@
 
  private:
   // Helper function for initializing global variables and arrays on a single host.
   void initializeDevice() {
-    row_covers_v.resize(batchsize_ * size_);
-    col_covers_v.resize(batchsize_ * size_);
-    row_duals_v.resize(batchsize_ * size_);
-    col_duals_v.resize(batchsize_ * size_);
-    col_slacks_v.resize(batchsize_ * size_);
-    row_is_visited_v.resize(batchsize_ * size_);
-    col_is_visited_v.resize(batchsize_ * size_);
-    row_parents_v.resize(batchsize_ * size_);
-    col_parents_v.resize(batchsize_ * size_);
-    row_children_v.resize(batchsize_ * size_);
-    col_children_v.resize(batchsize_ * size_);
-    obj_val_primal_v.resize(batchsize_);
-    obj_val_dual_v.resize(batchsize_);
+    cudaStream_t stream = handle_.get_stream();
+    row_covers_v.resize(batchsize_ * size_, stream);
+    col_covers_v.resize(batchsize_ * size_, stream);
+    row_duals_v.resize(batchsize_ * size_, stream);
+    col_duals_v.resize(batchsize_ * size_, stream);
+    col_slacks_v.resize(batchsize_ * size_, stream);
+    row_is_visited_v.resize(batchsize_ * size_, stream);
+    col_is_visited_v.resize(batchsize_ * size_, stream);
+    row_parents_v.resize(batchsize_ * size_, stream);
+    col_parents_v.resize(batchsize_ * size_, stream);
+    row_children_v.resize(batchsize_ * size_, stream);
+    col_children_v.resize(batchsize_ * size_, stream);
+    obj_val_primal_v.resize(batchsize_, stream);
+    obj_val_dual_v.resize(batchsize_, stream);
 
     d_vertices_dev.row_covers = row_covers_v.data();
     d_vertices_dev.col_covers = col_covers_v.data();
@@ -231,8 +233,7 @@
   int hungarianStep3() {
     int next;
 
-    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(),
-                                          handle_.get_stream(), 1);
+    rmm::device_uvector<bool> flag_v(1, handle_.get_stream());
 
     bool h_flag = false;
     raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream());
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh
index 0079f50e82..ce3f67f8fe 100644
--- a/cpp/include/raft/lap/lap_functions.cuh
+++ b/cpp/include/raft/lap/lap_functions.cuh
@@ -34,9 +34,8 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
-
 #include
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace lap {
@@ -127,10 +126,8 @@ inline void computeInitialAssignments(raft::handle_t const &handle,
 
   std::size_t size = SP * N;
 
-  raft::mr::device::buffer<int> row_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
-  raft::mr::device::buffer<int> col_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
+  rmm::device_uvector<int> row_lock_v(size, handle.get_stream());
+  rmm::device_uvector<int> col_lock_v(size, handle.get_stream());
 
   thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1);
   thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1);
@@ -216,25 +213,21 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
                                    weight_t epsilon) {
   vertex_t M;
 
-  raft::mr::device::buffer<vertex_t> csr_ptrs_v(handle.get_device_allocator(),
-                                                handle.get_stream(), 0);
-  raft::mr::device::buffer<vertex_t> csr_neighbors_v(
-    handle.get_device_allocator(), handle.get_stream(), 0);
+  rmm::device_uvector<vertex_t> csr_ptrs_v(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> csr_neighbors_v(0, handle.get_stream());
 
   {
     dim3 blocks_per_grid;
     dim3 threads_per_block;
     int total_blocks = 0;
 
-    raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                                handle.get_stream(), SP * N);
-    raft::mr::device::buffer<vertex_t> addresses_v(
-      handle.get_device_allocator(), handle.get_stream(), SP * N);
+    rmm::device_uvector<bool> predicates_v(SP * N, handle.get_stream());
+    rmm::device_uvector<vertex_t> addresses_v(SP * N, handle.get_stream());
 
     thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
    thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
-    csr_ptrs_v.resize(SP + 1);
+    csr_ptrs_v.resize(SP + 1, handle.get_stream());
 
     thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1});
 
                            addresses_v.end(), addresses_v.begin());
 
     if (M > 0) {
-      csr_neighbors_v.resize(M);
+      csr_neighbors_v.resize(M, handle.get_stream());
 
       kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block>>>(
@@ -302,10 +295,8 @@ inline void reversePass(raft::handle_t const &handle,
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, size);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), size);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), size);
+  rmm::device_uvector<bool> predicates_v(size, handle.get_stream());
+  rmm::device_uvector<vertex_t> addresses_v(size, handle.get_stream());
 
   thrust::fill_n(thrust::device, predicates_v.data(), size, false);
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
@@ -331,8 +322,7 @@
     raft::lap::detail::calculateLinearDims(
       blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size);
 
-    raft::mr::device::buffer<vertex_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(), csr_size);
+    rmm::device_uvector<vertex_t> elements_v(csr_size, handle.get_stream());
 
     kernel_augmentScatter<<<blocks_per_grid, threads_per_block>>>(
@@ -360,10 +350,8 @@ inline void augmentationPass(raft::handle_t const &handle,
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, SP * N);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), SP * N);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), SP * N);
+  rmm::device_uvector<bool> predicates_v(SP * N, handle.get_stream());
+  rmm::device_uvector<vertex_t> addresses_v(SP * N, handle.get_stream());
 
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
@@ -390,8 +378,8 @@
     raft::lap::detail::calculateLinearDims(
       blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size);
 
-    raft::mr::device::buffer<vertex_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size);
+    rmm::device_uvector<vertex_t> elements_v(row_ids_csr_size,
+                                             handle.get_stream());
 
     kernel_augmentScatter<<<blocks_per_grid, threads_per_block>>>(
@@ -420,8 +408,7 @@ inline void dualUpdate(raft::handle_t const &handle,
   dim3 threads_per_block;
   int total_blocks;
 
-  raft::mr::device::buffer<weight_t> sp_min_v(handle.get_device_allocator(),
-                                              handle.get_stream(), 1);
+  rmm::device_uvector<weight_t> sp_min_v(1, handle.get_stream());
 
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, SP);
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 6172618380..8ab7011db4 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace linalg {
@@ -52,8 +52,8 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
                                            n_cols, eig_vals, &lwork));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> d_dev_info(1, stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
@@ -104,9 +104,9 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
     CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
     n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
-  raft::mr::device::buffer<math_t> d_eig_vectors(allocator, stream, 0);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> d_dev_info(1, stream);
+  rmm::device_uvector<math_t> d_eig_vectors(0, stream);
 
   if (memUsage == OVERWRITE_INPUT) {
     CUSOLVER_CHECK(cusolverDnsyevdx(
@@ -176,8 +176,8 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
     eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> dev_info(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> dev_info(1, stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index cafa8d54f1..14771f289d 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -19,7 +19,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace linalg {
@@ -42,7 +42,6 @@ template <typename math_t>
 void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
             int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
@@ -50,14 +49,14 @@
   CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
                              cudaMemcpyDeviceToDevice, stream));
 
-  raft::mr::device::buffer<math_t> tau(allocator, stream, k);
+  rmm::device_uvector<math_t> tau(k, stream);
   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   int Lwork;
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
-  raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
   CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
                                  workspace.data(), Lwork, devInfo.data(),
                                  stream));
@@ -86,12 +85,11 @@ template <typename math_t>
 void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
              int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
-  raft::mr::device::buffer<math_t> R_full(allocator, stream, m * n);
-  raft::mr::device::buffer<math_t> tau(allocator, stream, min(m, n));
+  rmm::device_uvector<math_t> R_full(m * n, stream);
+  rmm::device_uvector<math_t> tau(min(m, n), stream);
   CUDA_CHECK(
     cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
   int R_full_nrows = m, R_full_ncols = n;
@@ -99,12 +97,12 @@
                              cudaMemcpyDeviceToDevice, stream));
 
   int Lwork;
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
                                             R_full_ncols, R_full.data(),
                                             R_full_nrows, &Lwork));
-  raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
   CUSOLVER_CHECK(cusolverDngeqrf(
     cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
     tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 7357a68a4c..a1507cfa9b 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include "eig.cuh"
 #include "gemm.cuh"
 #include "transpose.h"
@@ -54,8 +54,6 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
            T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
            bool trans_right, bool gen_left_vec, bool gen_right_vec,
            cudaStream_t stream) {
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
@@ -71,13 +69,13 @@
   const int m = n_rows;
   const int n = n_cols;
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   T *d_rwork = nullptr;
 
   int lwork = 0;
   CUSOLVER_CHECK(
     cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
-  raft::mr::device::buffer<T> d_work(allocator, stream, lwork);
+  rmm::device_uvector<T> d_work(lwork, stream);
 
   char jobu = 'S';
   char jobvt = 'A';
@@ -112,12 +110,11 @@ template <typename T>
 void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
             T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int len = n_cols * n_cols;
-  raft::mr::device::buffer<T> in_cross_mult(allocator, stream, len);
+  rmm::device_uvector<T> in_cross_mult(len, stream);
 
   T alpha = T(1);
   T beta = T(0);
@@ -162,7 +159,6 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
                math_t *sing_vals, math_t *left_sing_vecs,
                math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
                math_t tol, int max_sweeps, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   gesvdjInfo_t gesvdj_params = NULL;
@@ -174,7 +170,7 @@
   int m = n_rows;
   int n = n_cols;
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   int lwork = 0;
   int econ = 1;
 
@@ -183,7 +179,7 @@
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
     left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
 
   CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
@@ -210,10 +206,8 @@ template <typename math_t>
 void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
                        math_t *V, math_t *out, int n_rows, int n_cols, int k,
                        cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
-
   const math_t alpha = 1.0, beta = 0.0;
-  raft::mr::device::buffer<math_t> SVT(allocator, stream, k * n_cols);
+  rmm::device_uvector<math_t> SVT(k * n_cols, stream);
 
   raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
                      CUBLAS_OP_T, alpha, beta, stream);
@@ -239,14 +233,13 @@ template <typename math_t>
 bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
                          math_t *S_vec, math_t *V, int n_rows, int n_cols,
                          int k, math_t tol, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int m = n_rows, n = n_cols;
 
   // form product matrix
-  raft::mr::device::buffer<math_t> P_d(allocator, stream, m * n);
-  raft::mr::device::buffer<math_t> S_mat(allocator, stream, k * k);
+  rmm::device_uvector<math_t> P_d(m * n, stream);
+  rmm::device_uvector<math_t> S_mat(k * k, stream);
   CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
   CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
@@ -262,7 +255,7 @@
 
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
-  raft::mr::device::buffer<math_t> A_minus_P(allocator, stream, m * n);
+  rmm::device_uvector<math_t> A_minus_P(m * n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index 0a72117140..f84b85d222 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace matrix {
@@ -294,10 +294,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
   auto d_src = src;
   auto d_dest = dest;
 
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
-
-  raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_sum(1, stream);
   auto *d_sum_ptr = d_sum.data();
   auto no_op = [] __device__(math_t in) { return in; };
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 56710ea81f..a96c0bae38 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -25,8 +25,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
+#include <rmm/device_uvector.hpp>
 #include
 #include "rng_impl.cuh"
 
@@ -512,10 +512,10 @@ class Rng {
     std::shared_ptr<raft::mr::device::allocator> allocator =
       handle.get_device_allocator();
 
-    raft::mr::device::buffer<WeightsT> expWts(allocator, stream, len);
-    raft::mr::device::buffer<WeightsT> sortedWts(allocator, stream, len);
-    raft::mr::device::buffer<IdxT> inIdx(allocator, stream, len);
-    raft::mr::device::buffer<IdxT> outIdxBuff(allocator, stream, len);
+    rmm::device_uvector<WeightsT> expWts(len, stream);
+    rmm::device_uvector<WeightsT> sortedWts(len, stream);
+    rmm::device_uvector<IdxT> inIdx(len, stream);
+    rmm::device_uvector<IdxT> outIdxBuff(len, stream);
     auto *inIdxPtr = inIdx.data();
     // generate modified weights
     custom_distribution(
@@ -533,7 +533,7 @@
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
     IdxT *outIdxPtr = outIdxBuff.data();
-    raft::mr::device::buffer<char> workspace(allocator, stream);
+    rmm::device_uvector<char> workspace(0, stream);
     sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr,
               (int)len, stream);
     if (outIdx != nullptr) {
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index a034bdbda8..31a8f54721 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -50,15 +50,15 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
   auto stream = handle.get_stream();
   auto cusparseHandle = handle.get_cusparse_handle();
   auto d_alloc = handle.get_device_allocator();
-  raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz);
+  rmm::device_uvector<int> dstRows(nnz, stream);
   CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
                              cudaMemcpyDeviceToDevice, stream));
   CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
                              cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
-  raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize);
-  raft::mr::device::buffer<int> P(d_alloc, stream, nnz);
+  rmm::device_uvector<char> pBuffer(buffSize, stream);
+  rmm::device_uvector<int> P(nnz, stream);
   CUSPARSE_CHECK(
     cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
   raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
@@ -154,7 +154,7 @@ template <typename T>
 void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
                        cudaStream_t stream) {
-  raft::mr::device::buffer<T> row_counts(d_alloc, stream, m);
+  rmm::device_uvector<T> row_counts(m, stream);
 
   CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
index 73120fea8c..40d1a06720 100644
--- a/cpp/include/raft/sparse/coo.cuh
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -18,7 +18,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include
 
@@ -58,9 +58,9 @@ namespace sparse {
 template <typename T, typename Index_Type = int>
 class COO {
  protected:
-  raft::mr::device::buffer<Index_Type> rows_arr;
-  raft::mr::device::buffer<Index_Type> cols_arr;
-  raft::mr::device::buffer<T> vals_arr;
+  rmm::device_uvector<Index_Type> rows_arr;
+  rmm::device_uvector<Index_Type> cols_arr;
+  rmm::device_uvector<T> vals_arr;
 
  public:
   Index_Type nnz;
@@ -72,9 +72,9 @@
    * @param stream: CUDA stream to use
    */
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream)
-    : rows_arr(d_alloc, stream, 0),
-      cols_arr(d_alloc, stream, 0),
-      vals_arr(d_alloc, stream, 0),
+    : rows_arr(0, stream),
+      cols_arr(0, stream),
+      vals_arr(0, stream),
       nnz(0),
       n_rows(0),
       n_cols(0) {}
@@ -87,10 +87,9 @@
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of cols in the dense matrix
    */
-  COO(raft::mr::device::buffer<Index_Type> &rows,
-      raft::mr::device::buffer<Index_Type> &cols,
-      raft::mr::device::buffer<T> &vals, Index_Type nnz, Index_Type n_rows = 0,
-      Index_Type n_cols = 0)
+  COO(rmm::device_uvector<Index_Type> &rows,
+      rmm::device_uvector<Index_Type> &cols, rmm::device_uvector<T> &vals,
+      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0)
     : rows_arr(rows),
       cols_arr(cols),
      vals_arr(vals),
@@ -109,9 +108,9 @@
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream,
       Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0,
       bool init = true)
-    : rows_arr(d_alloc, stream, nnz),
-      cols_arr(d_alloc, stream, nnz),
-      vals_arr(d_alloc, stream, nnz),
+    : rows_arr(nnz, stream),
+      cols_arr(nnz, stream),
+      vals_arr(nnz, stream),
       nnz(nnz),
       n_rows(n_rows),
       n_cols(n_cols) {
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
index bc4a68d296..b30d7af8b4 100644
--- a/cpp/include/raft/sparse/csr.cuh
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -219,7 +219,7 @@
 void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
             Index_ nnz, Index_ N,
             std::shared_ptr<raft::mr::device::allocator> d_alloc,
             cudaStream_t stream, Lambda filter_op) {
-  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  rmm::device_uvector<bool> m(1, stream);
 
   WeakCCState state(m.data());
   weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
@@ -253,7 +253,7 @@
 void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
             Index_ nnz, Index_ N,
             std::shared_ptr<raft::mr::device::allocator> d_alloc,
             cudaStream_t stream) {
-  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  rmm::device_uvector<bool> m(1, stream);
   WeakCCState state(m.data());
   weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream,
                   [](Index_) { return true; });
diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh
index ae5cbdf9d3..1ad5466aad 100644
--- a/cpp/include/raft/sparse/distance/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/bin_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -88,8 +88,8 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
                          cusparseHandle_t handle,
                          std::shared_ptr<raft::mr::device::allocator> alloc,
                          cudaStream_t stream, expansion_f expansion_func) {
-  raft::mr::device::buffer<value_t> Q_norms(alloc, stream, m);
-  raft::mr::device::buffer<value_t> R_norms(alloc, stream, n);
+  rmm::device_uvector<value_t> Q_norms(m, stream);
+  rmm::device_uvector<value_t> R_norms(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
   CUDA_CHECK(
@@ -113,9 +113,7 @@ class jaccard_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit jaccard_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -123,8 +121,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -149,7 +147,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -162,9 +160,7 @@ class dice_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit dice_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -172,8 +168,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -194,7 +190,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh
index cd8ca09913..2ee0abe47a 100644
--- a/cpp/include/raft/sparse/distance/csr_spmv.cuh
+++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh
@@ -20,7 +20,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -346,7 +346,7 @@ template <typename value_idx>
 inline value_idx max_degree(
   value_idx *indptr, value_idx n_rows,
   std::shared_ptr<raft::mr::device::allocator> allocator,
   cudaStream_t stream) {
-  raft::mr::device::buffer<value_idx> max_d(allocator, stream, 1);
+  rmm::device_uvector<value_idx> max_d(1, stream);
   CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream));
 
   /**
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
index 90717bfc5f..6297b009f4 100644
--- a/cpp/include/raft/sparse/distance/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -23,7 +23,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -92,10 +92,10 @@ class ip_distances_gemm_t : public ip_trans_getters_t<value_idx, value_t> {
  explicit ip_distances_gemm_t(
    const distances_config_t<value_idx, value_t> &config)
    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      csc_indptr(config.allocator, config.stream, 0),
-      csc_indices(config.allocator, config.stream, 0),
-      csc_data(config.allocator, config.stream, 0),
+      workspace(0, config.stream),
+      csc_indptr(0, config.stream),
+      csc_indices(0, config.stream),
+      csc_data(0, config.stream),
      alpha(1.0) {
    init_mat_descriptor(matA);
    init_mat_descriptor(matB);
@@ -118,12 +118,10 @@
     /**
      * Compute pairwise distances and return dense matrix in column-major format
      */
-    raft::mr::device::buffer<value_idx> out_batch_indptr(
-      config_->allocator, config_->stream, config_->a_nrows + 1);
-    raft::mr::device::buffer<value_idx> out_batch_indices(config_->allocator,
-                                                          config_->stream, 0);
-    raft::mr::device::buffer<value_t> out_batch_data(config_->allocator,
-                                                     config_->stream, 0);
+    rmm::device_uvector<value_idx> out_batch_indptr(config_->a_nrows + 1,
+                                                    config_->stream);
+    rmm::device_uvector<value_idx> out_batch_indices(0, config_->stream);
+    rmm::device_uvector<value_t> out_batch_data(0, config_->stream);
 
     value_idx out_batch_nnz = get_nnz(out_batch_indptr.data());
 
@@ -219,10 +217,10 @@
  cusparseMatDescr_t matC;
  cusparseMatDescr_t matD;
  cusparsePointerMode_t orig_ptr_mode;
-  raft::mr::device::buffer<char> workspace;
-  raft::mr::device::buffer<value_idx> csc_indptr;
-  raft::mr::device::buffer<value_idx> csc_indices;
-  raft::mr::device::buffer<value_t> csc_data;
+  rmm::device_uvector<char> workspace;
+  rmm::device_uvector<value_idx> csc_indptr;
+  rmm::device_uvector<value_idx> csc_indices;
+  rmm::device_uvector<value_t> csc_data;
  const distances_config_t<value_idx, value_t> *config_;
};
 
@@ -234,8 +232,7 @@ class ip_distances_spmv_t : public ip_trans_getters_t<value_idx, value_t> {
   * @param[in] config specifies inputs, outputs, and sizes
   */
  ip_distances_spmv_t(const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      coo_rows_b(config.allocator, config.stream, config.b_nnz) {
+    : config_(&config), coo_rows_b(config.b_nnz, config.stream) {
    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
                                      coo_rows_b.data(), config_->b_nnz,
                                      config_->stream);
@@ -262,7 +259,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<value_idx> coo_rows_b;
+  rmm::device_uvector<value_idx> coo_rows_b;
};
 
 template
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
index 829471e0e3..3898189630 100644
--- a/cpp/include/raft/sparse/distance/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -92,8 +92,8 @@ void compute_l2(value_t *out, const value_idx *Q_coo_rows,
                cusparseHandle_t handle,
                std::shared_ptr<raft::mr::device::allocator> alloc,
                cudaStream_t stream, expansion_f expansion_func) {
-  raft::mr::device::buffer<value_t> Q_sq_norms(alloc, stream, m);
-  raft::mr::device::buffer<value_t> R_sq_norms(alloc, stream, n);
+  rmm::device_uvector<value_t> Q_sq_norms(m, stream);
+  rmm::device_uvector<value_t> R_sq_norms(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
   CUDA_CHECK(
@@ -117,9 +117,7 @@ class l2_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit l2_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -127,8 +125,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -146,7 +144,7 @@
 
 protected:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -186,9 +184,7 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit cosine_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -196,8 +192,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -221,7 +217,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -239,9 +235,7 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit hellinger_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     // First sqrt A and B
@@ -282,7 +276,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
index e524d87b7c..4a7d5e53ff 100644
--- a/cpp/include/raft/sparse/distance/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -62,8 +62,8 @@ void unexpanded_lp_distances(
   // for max occupancy.
   // Ref: https://github.com/rapidsai/cuml/issues/3371
 
-  raft::mr::device::buffer<value_idx> coo_rows(
-    config_->allocator, config_->stream, max(config_->b_nnz, config_->a_nnz));
+  rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+                                          config_->stream);
 
   raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
                                     coo_rows.data(), config_->b_nnz,
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index 47b1ba6e41..6bb4b31bd5 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -168,7 +168,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  raft::mr::device::buffer<int> row_counts(d_alloc, stream, m + 1);
+  rmm::device_uvector<int> row_counts(m + 1, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 15302f3b74..0d6c55bb65 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -19,8 +19,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -36,15 +36,15 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
                   unsigned long long seed = 1234567) {
   auto stream = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
-  raft::mr::device::buffer<int> src_offsets(d_alloc, stream, n + 1);
-  raft::mr::device::buffer<int> dst_cols(d_alloc, stream, nnz);
-  raft::mr::device::buffer<T> dst_vals(d_alloc, stream, nnz);
+  rmm::device_uvector<int> src_offsets(n + 1, stream);
+  rmm::device_uvector<int> dst_cols(nnz, stream);
+  rmm::device_uvector<T> dst_vals(nnz, stream);
   convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
                       dst_cols.data(), dst_vals.data());
 
-  raft::mr::device::buffer<T> eigVals(d_alloc, stream, n_components + 1);
-  raft::mr::device::buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
-  raft::mr::device::buffer<int> labels(d_alloc, stream, n);
+  rmm::device_uvector<T> eigVals(n_components + 1, stream);
+  rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);
+  rmm::device_uvector<int> labels(n, stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 5c2c78f0c3..128ecb21bb 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -31,7 +31,6 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
 #include
 
@@ -135,7 +134,7 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
 
   ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");
 
-  raft::mr::device::buffer<int> in_row_ind(d_alloc, stream, in->n_rows);
+  rmm::device_uvector<int> in_row_ind(in->n_rows, stream);
 
   convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream);
 
@@ -265,11 +264,11 @@ void from_knn_symmetrize_matrix(
                     raft::ceildiv(k, TPB_Y));
 
   // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4)
-  raft::mr::device::buffer<value_idx> row_sizes(d_alloc, stream, n);
+  rmm::device_uvector<value_idx> row_sizes(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
 
-  raft::mr::device::buffer<value_idx> row_sizes2(d_alloc, stream, n);
+  rmm::device_uvector<value_idx> row_sizes2(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
index 6afe4ca8f6..7979de5657 100644
--- a/cpp/include/raft/sparse/linalg/transpose.h
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -72,8 +72,8 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
     CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
     &convert_csc_workspace_size, stream));
 
-  raft::mr::device::buffer<char> convert_csc_workspace(
-    allocator, stream, convert_csc_workspace_size);
+  rmm::device_uvector<char> convert_csc_workspace(convert_csc_workspace_size,
+                                                  stream);
 
   CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
     handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 562d506cfe..5df7decf8b 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -87,8 +87,8 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals,
                        int nnz, int *cur_cnnz, T scalar, int n,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
                        cudaStream_t stream) {
-  raft::mr::device::buffer<int> ex_scan(d_alloc, stream, n);
-  raft::mr::device::buffer<int> cur_ex_scan(d_alloc, stream, n);
+  rmm::device_uvector<int> ex_scan(n, stream);
+  rmm::device_uvector<int> cur_ex_scan(n, stream);
 
   CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
   CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
@@ -129,8 +129,8 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
                       cudaStream_t stream) {
-  raft::mr::device::buffer<int> row_count_nz(d_alloc, stream, in->n_rows);
-  raft::mr::device::buffer<int> row_count(d_alloc, stream, in->n_rows);
+  rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<int> row_count(in->n_rows, stream);
 
   CUDA_CHECK(
     cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 09494e9eb1..a6834552e3 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -19,6 +19,8 @@
 #include
 #include
 
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include
@@ -243,13 +245,12 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  raft::mr::device::buffer<int64_t> trans(allocator, userStream,
-                                          id_ranges->size());
+  rmm::device_uvector<int64_t> trans(id_ranges->size(), userStream);
   raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
                       userStream);
 
-  raft::mr::device::buffer<float> all_D(allocator, userStream, 0);
-  raft::mr::device::buffer<int64_t> all_I(allocator, userStream, 0);
+  rmm::device_uvector<float> all_D(0, userStream);
+  rmm::device_uvector<int64_t> all_I(0, userStream);
 
   float *out_D = res_D;
   int64_t *out_I = res_I;
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index a645412c2f..d9292440d1 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -20,9 +20,9 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
 #include
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace spatial {
@@ -57,7 +57,7 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
  size_t n_cols_;
  cudaStream_t stream_;
  std::shared_ptr<raft::mr::device::allocator> device_allocator_;
-  raft::mr::device::buffer<math_t> colsums_;
+  rmm::device_uvector<math_t> colsums_;
 
 public:
  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major,
                        cudaStream_t stream,
                        std::shared_ptr<raft::mr::device::allocator> allocator)
    : device_allocator_(allocator),
      stream_(stream),
-      colsums_(allocator, stream, n_rows),
+      colsums_(n_rows, stream),
      n_cols_(n_cols),
      n_rows_(n_rows),
      row_major_(row_major),
@@ -108,7 +108,7 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
                             std::shared_ptr<raft::mr::device::allocator> allocator)
    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream,
                                    allocator),
-      means_(allocator, stream, n_rows) {}
+      means_(n_rows, stream) {}
 
   void preprocess(math_t *data) {
     math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
@@ -143,7 +143,7 @@
 
   ~CorrelationMetricProcessor() = default;
 
-  raft::mr::device::buffer<math_t> means_;
+  rmm::device_uvector<math_t> means_;
};
 
 template
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 04f473f836..08429e18f2 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -24,6 +24,8 @@
 */
 
 #include
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include
@@ -65,15 +67,12 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
   for (int j = 0; j < problemcount; j++) {
     generateProblem(h_cost, batchsize, problemsize, costrange);
 
-    raft::mr::device::buffer<weight_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize * problemsize);
-    raft::mr::device::buffer<vertex_t> row_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
-    raft::mr::device::buffer<vertex_t> col_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
+    rmm::device_uvector<weight_t> elements_v(
+      batchsize * problemsize * problemsize, handle.get_stream());
+    rmm::device_uvector<vertex_t> row_assignment_v(batchsize * problemsize,
+                                                   handle.get_stream());
+    rmm::device_uvector<vertex_t> col_assignment_v(batchsize * problemsize,
+                                                   handle.get_stream());
 
     raft::update_device(elements_v.data(), h_cost,
                         batchsize * problemsize * problemsize,
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 00236d53fa..3c84d0db5f 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -20,7 +20,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include "../test_utils.h"
@@ -31,12 +32,11 @@ template <typename math_t>
 class CholeskyR1Test : public ::testing::Test {
 protected:
  CholeskyR1Test()
-    : allocator(handle.get_device_allocator()),
-      G(allocator, handle.get_stream(), n_rows * n_rows),
-      L(allocator, handle.get_stream(), n_rows * n_rows),
-      L_exp(allocator, handle.get_stream(), n_rows * n_rows),
-      devInfo(allocator, handle.get_stream(), 1),
-      workspace(allocator, handle.get_stream()) {
+    : G(n_rows * n_rows, handle.get_stream()),
+      L(n_rows * n_rows, handle.get_stream()),
+      L_exp(n_rows * n_rows, handle.get_stream()),
+      devInfo(1, handle.get_stream()),
+      workspace(0, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(G.data(), G_host, n_rows * n_rows, stream);
@@ -105,7 +105,6 @@
   }
 
   raft::handle_t handle;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
   cusolverDnHandle_t solver_handle;
   cudaStream_t stream;
 
@@ -120,11 +119,11 @@
 
   math_t G2_host[4] = {3, 4, 2, 1};
 
-  raft::mr::device::buffer<int> devInfo;
-  raft::mr::device::buffer<math_t> G;
-  raft::mr::device::buffer<math_t> L_exp;
-  raft::mr::device::buffer<math_t> L;
-  raft::mr::device::buffer<math_t> workspace;
+  rmm::device_uvector<int> devInfo;
+  rmm::device_uvector<math_t> G;
+  rmm::device_uvector<math_t> L_exp;
+  rmm::device_uvector<math_t> L;
+  rmm::device_uvector<math_t> workspace;
};
 
 typedef ::testing::Types FloatTypes;
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 6e146fa4bb..c78dd9e8fb 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 #include "../test_utils.h"
 
 namespace raft {
@@ -131,9 +132,7 @@ class MapGenericReduceTest : public ::testing::Test {
 protected:
  MapGenericReduceTest()
-    : allocator(handle.get_device_allocator()),
-      input(allocator, handle.get_stream(), n),
-      output(allocator, handle.get_stream(), 1) {
+    : input(n, handle.get_stream()), output(1, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     initInput(input.data(), input.size(), stream);
@@ -172,9 +171,8 @@
  int n = 1237;
  raft::handle_t handle;
  cudaStream_t stream;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
-  raft::mr::device::buffer<InType> input;
-  raft::mr::device::buffer<OutType> output;
+  rmm::device_uvector<InType> input;
+  rmm::device_uvector<OutType> output;
};
 
 using IoTypePair =
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 28222c0697..e7da92a136 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 #include "../test_utils.h"
 
 namespace raft {
@@ -102,10 +103,9 @@ class MatrixCopyRowsTest : public ::testing::Test {
 protected:
  MatrixCopyRowsTest()
-    : allocator(handle.get_device_allocator()),
-      input(allocator, handle.get_stream(), n_cols * n_rows),
-      indices(allocator, handle.get_stream(), n_selected),
-      output(allocator, handle.get_stream(), n_cols * n_selected) {
+    : input(n_cols * n_rows, handle.get_stream()),
+      indices(n_selected, handle.get_stream()),
+      output(n_cols * n_selected, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(indices.data(), indices_host, n_selected, stream);
@@ -143,10 +143,9 @@
                             14, 21, 22, 23, 27, 28, 29};
  raft::handle_t handle;
  cudaStream_t stream;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
-  raft::mr::device::buffer<T> input;
-  raft::mr::device::buffer<T> output;
-  raft::mr::device::buffer<IndexT> indices;
+  rmm::device_uvector<T> input;
+  rmm::device_uvector<T> output;
+  rmm::device_uvector<IndexT> indices;
};
 
 using TypeTuple =
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 6e3f3b5038..3582075bfb 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 
 #include
 #include
@@ -67,9 +68,8 @@ class SparseDistanceCOOSPMVTest
 
   template <typename reduce_f, typename accum_f, typename write_f>
   void compute_dist(reduce_f reduce_func, accum_f accum_func,
write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.allocator, dist_config.stream, - max(dist_config.b_nnz, dist_config.a_nnz)); + rmm::device_uvector coo_rows( + max(dist_config.b_nnz, dist_config.a_nnz), dist_config.stream); raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, coo_rows.data(), dist_config.b_nnz, diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..64403eab7f 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -117,7 +117,7 @@ double compute_rand_index( ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); //allocating and initializing memory for a and b in the GPU - raft::mr::device::buffer arr_buf(allocator, stream, 2); + rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); //kernel configuration diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..e96e5c289c 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -17,11 +17,12 @@ #include #include #include -#include "../test_utils.h" - #include #include #include +#include + +#include "../test_utils.h" #include @@ -87,7 +88,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); - raft::mr::device::buffer coo_rows(alloc, stream, nnz); + rmm::device_uvector coo_rows(nnz, stream); raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); @@ -96,7 +97,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - raft::mr::device::buffer sum(alloc, stream, 1); + rmm::device_uvector sum(1, stream); CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); From 360bebb18959f026295600a91a7d2fa9bc520b5b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 30 Jun 2021 14:53:49 +0200 Subject: [PATCH 02/17] Remove references to RAFT allocator adapter --- cpp/include/raft/comms/helper.hpp | 10 ++-- cpp/include/raft/comms/std_comms.hpp | 40 ++++++---------- cpp/include/raft/label/classlabels.cuh | 33 +++++-------- .../raft/linalg/cholesky_r1_update.cuh | 8 ++-- cpp/include/raft/linalg/eig.cuh | 3 -- cpp/include/raft/matrix/math.cuh | 2 - cpp/include/raft/random/rng.cuh | 5 -- cpp/include/raft/sparse/convert/csr.cuh | 12 +---- cpp/include/raft/sparse/coo.cuh | 10 ++-- cpp/include/raft/sparse/csr.cuh | 11 +---- .../raft/sparse/distance/bin_distance.cuh | 10 ++-- cpp/include/raft/sparse/distance/common.h | 3 -- cpp/include/raft/sparse/distance/coo_spmv.cuh | 1 - cpp/include/raft/sparse/distance/csr_spmv.cuh | 12 ++--- cpp/include/raft/sparse/distance/distance.cuh | 1 - .../raft/sparse/distance/ip_distance.cuh | 3 +- .../raft/sparse/distance/l2_distance.cuh | 10 ++-- .../raft/sparse/distance/lp_distance.cuh | 1 - .../sparse/hierarchy/detail/agglomerative.cuh | 2 - .../hierarchy/detail/connectivities.cuh | 8 ++-- .../raft/sparse/hierarchy/detail/mst.cuh | 13 ++---- .../raft/sparse/hierarchy/single_linkage.hpp | 1 - cpp/include/raft/sparse/linalg/add.cuh | 3 -- cpp/include/raft/sparse/linalg/spectral.cuh | 2 - cpp/include/raft/sparse/linalg/symmetrize.cuh | 18 +++----- cpp/include/raft/sparse/linalg/transpose.h | 6 +-- cpp/include/raft/sparse/op/filter.cuh | 19 ++------ cpp/include/raft/sparse/op/reduce.cuh | 2 - cpp/include/raft/sparse/op/sort.h | 12 +---- 
.../sparse/selection/connect_components.cuh | 26 ++++------- cpp/include/raft/sparse/selection/knn.cuh | 18 ++------ .../raft/sparse/selection/knn_graph.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 3 -- .../knn/detail/ann_quantized_faiss.cuh | 6 +-- .../knn/detail/knn_brute_force_faiss.cuh | 9 ++-- .../raft/spatial/knn/detail/processing.hpp | 23 ++++------ cpp/include/raft/spatial/knn/knn.hpp | 6 +-- cpp/include/raft/spectral/matrix_wrappers.hpp | 46 ++++++------------- cpp/test/label/label.cu | 18 +++----- cpp/test/label/merge_labels.cu | 1 - cpp/test/linalg/binary_op.cu | 8 ++-- cpp/test/linalg/cholesky_r1.cu | 1 - cpp/test/linalg/map.cu | 1 - cpp/test/mr/device/buffer.cpp | 18 ++++---- cpp/test/sparse/add.cu | 5 +- cpp/test/sparse/connect_components.cu | 11 ++--- cpp/test/sparse/convert_csr.cu | 5 +- cpp/test/sparse/csr_row_slice.cu | 4 -- cpp/test/sparse/csr_to_dense.cu | 4 -- cpp/test/sparse/csr_transpose.cu | 6 +-- cpp/test/sparse/dist_coo_spmv.cu | 4 -- cpp/test/sparse/dist_csr_spmv.cu | 4 -- cpp/test/sparse/distance.cu | 4 -- cpp/test/sparse/filter.cu | 13 ++---- cpp/test/sparse/knn.cu | 7 +-- cpp/test/sparse/knn_graph.cu | 3 +- cpp/test/sparse/linkage.cu | 10 ++-- cpp/test/sparse/reduce.cu | 3 +- cpp/test/sparse/selection.cu | 2 - cpp/test/sparse/sort.cu | 5 +- cpp/test/sparse/symmetrize.cu | 12 ++--- cpp/test/spatial/haversine.cu | 3 -- 62 files changed, 161 insertions(+), 390 deletions(-) diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..e01490d728 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -38,11 +38,10 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream))); + new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -80,12 +79,11 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, } } - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..ff75931fb9 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -64,17 +64,16 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0), - device_allocator_(device_allocator) { + next_request_id_(0) { initialize(); }; @@ -85,27 +84,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ 
std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false), - device_allocator_(device_allocator) { + subcomms_ucp_(false) { initialize(); }; - virtual ~std_comms() { - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); - device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); - } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = status_.data(); + recvbuff_ = status_.data() + 1; } int get_size() const { return num_ranks_; } @@ -113,8 +104,8 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } std::unique_ptr comm_split(int color, int key) const { - mr::device::buffer d_colors(device_allocator_, stream_, get_size()); - mr::device::buffer d_keys(device_allocator_, stream_, get_size()); + rmm::device_uvector d_colors(get_size(), stream_); + rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); @@ -167,12 +158,12 @@ class std_comms : public comms_iface { if (ucp_worker_ != nullptr && subcomms_ucp_) { auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + return std::unique_ptr( + new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, + subcomm_ranks.size(), key, stream_, subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } @@ -465,6 +456,7 @@ class std_comms : public comms_iface { cudaStream_t stream_; int *sendbuff_, *recvbuff_; + rmm::device_uvector status_; int num_ranks_; int rank_; @@ -478,8 +470,6 @@ class std_comms : public comms_iface { mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; - - std::shared_ptr device_allocator_; }; } // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index f2b2463165..01a0afa774 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include namespace raft { @@ -40,12 +39,11 @@ namespace label { * on exit it has size [n_unique] - * \param [out] n_unique number of unique labels + * \return the number of unique labels * \param [in] stream cuda stream - * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, - cudaStream_t stream, - std::shared_ptr allocator) { +int getUniquelabels(rmm::device_uvector &y_unique, value_t *y, + size_t n, cudaStream_t stream) { + int n_unique; rmm::device_uvector y2(n, stream); rmm::device_uvector y3(n, stream); rmm::device_uvector d_num_selected(1, stream); @@ -64,13 +62,13 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(),
n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); + raft::update_host(&n_unique, d_num_selected.data(), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); - raft::copy(*y_unique, y3.data(), *n_unique, stream); + y_unique.resize(n_unique, stream); + raft::copy(y_unique.data(), y3.data(), n_unique, stream); + return n_unique; } /** @@ -147,22 +145,17 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, - std::shared_ptr allocator, - bool zero_based = false) { + Lambda filter_op, bool zero_based = false) { static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; - int num_clusters; - getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); + rmm::device_uvector map_ids(0, stream); + int num_clusters = getUniquelabels(map_ids, in, N, stream); map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); - - allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); + map_ids.data(), num_clusters, in, out, N, filter_op, zero_based); } /** @@ -184,11 +177,9 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - std::shared_ptr allocator, bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..d6d064c20e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -63,11 +63,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_L = n_rows; - * device_buffer L(allocator, stream, ld_L * n_rows); + * rmm::device_uvector L(ld_L * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * - * for (n=1; n<=n_rows; rank++) { + * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into A_new @@ -87,11 +87,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_U = n_rows; - * device_buffer U(allocator, stream, ld_U * n_rows); + * rmm::device_uvector U(ld_U * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into array A_new diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 8ab7011db4..83abd96756 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -44,7 +44,6 @@ template void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH =
handle.get_cusolver_dn_handle(); int lwork; @@ -93,7 +92,6 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -163,7 +161,6 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index f84b85d222..cd99bbfc84 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include namespace raft { @@ -285,7 +284,6 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param src: input matrix * @param dest: output matrix. The result is stored in the dest matrix * @param len: number elements of input matrix - * @param allocator device allocator * @param stream cuda stream */ template diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index a96c0bae38..3d2e44e49b 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -498,7 +497,6 @@ class Rng { * sampling is desired * @param sampledLen output sampled array length * @param len input array length - * @param allocator device allocator for allocating any workspace required * @param stream cuda stream */ template @@ -509,9 +507,6 @@ class Rng { ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - std::shared_ptr allocator = - handle.get_device_allocator(); - rmm::device_uvector expWts(len, stream); rmm::device_uvector sortedWts(len, stream); rmm::device_uvector inIdx(len, stream); diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 31a8f54721..16f351bf48 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,6 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, int *dst_offsets, int *dstCols, value_t *dstVals) { auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector dstRows(nnz, stream); CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); @@ -147,12 +145,10 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param nnz: size of COO rows array * @param row_ind: output row indices array * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, cudaStream_t stream) { rmm::device_uvector row_counts(m, stream); @@ -173,15 +169,11 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * * @param coo: Input COO matrix * @param row_ind: output row indices array - * @param d_alloc 
device allocator for temporary buffers * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); +void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 40d1a06720..6af8eae395 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -68,10 +67,9 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) + COO(cudaStream_t stream) : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), @@ -98,16 +96,14 @@ class COO { n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use * @param stream: CUDA stream to use * @param nnz: size of the rows/cols/vals arrays * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) + COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, + Index_Type n_cols = 0, bool init = true) : rows_arr(nnz, stream), cols_arr(nnz, stream), vals_arr(nnz, stream), diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index b30d7af8b4..a9ee3e63b2 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -208,7 +207,6 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) 
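As the next hunk shows, weak_cc now creates its own single-element state buffer through RMM and takes only a stream from the caller. A minimal calling sketch under the new signature; the graph identifiers (row_ind, row_ind_ptr, nnz, N) are hypothetical caller-side values, and the raft::sparse namespace is assumed from csr.cuh's location, not confirmed by this patch:

    #include <raft/sparse/csr.cuh>
    #include <rmm/device_uvector.hpp>

    // Hypothetical caller: label the connected components of one CSR
    // adjacency graph of N vertices. No allocator argument remains;
    // weak_cc's internal scratch comes from RMM's current device resource.
    void label_graph(const int *row_ind, const int *row_ind_ptr, int nnz,
                     int N, cudaStream_t stream) {
      rmm::device_uvector<int> labels(N, stream);
      raft::sparse::weak_cc(labels.data(), row_ind, row_ind_ptr, nnz, N,
                            stream);
    }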
@@ -216,9 +214,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, template bool> void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { + Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { rmm::device_uvector m(1, stream); WeakCCState state(m.data()); @@ -245,14 +241,11 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use */ template void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream) { + Index_ nnz, Index_ N, cudaStream_t stream) { rmm::device_uvector m(1, stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index 1ad5466aad..68c9548f7f 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -85,9 +84,8 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - cusparseHandle_t handle, - std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cusparseHandle_t handle, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); CUDA_CHECK( @@ -130,7 +128,7 @@ class jaccard_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t denom = q_r_union - dot; @@ -177,7 +175,7 @@ class dice_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t dice = (2 * dot) / q_r_union; diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 712d2c52bd..aabf5e47d4 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -17,7 +17,6 @@ #pragma once #include -#include namespace raft { namespace sparse { @@ -42,8 +41,6 @@ struct distances_config_t { value_t *b_data; cusparseHandle_t handle; - - std::shared_ptr allocator; cudaStream_t stream; }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index d596c6b852..edd1b78ae8 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -19,7 
+19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh index 2ee0abe47a..635e4568fd 100644 --- a/cpp/include/raft/sparse/distance/csr_spmv.cuh +++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -343,9 +342,8 @@ __global__ void max_kernel(value_idx *out, value_idx *in, value_idx n) { } template -inline value_idx max_degree( - value_idx *indptr, value_idx n_rows, - std::shared_ptr allocator, cudaStream_t stream) { +inline value_idx max_degree(value_idx *indptr, value_idx n_rows, + cudaStream_t stream) { rmm::device_uvector max_d(1, stream); CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream)); @@ -466,9 +464,9 @@ void generalized_csr_pairwise_semiring( int nnz_upper_bound = max_nnz_per_block(); // max_nnz set from max(diff(indptrA)) - value_idx max_nnz = max_degree(config_.a_indptr, config_.a_nrows, - config_.allocator, config_.stream) + - 1; + value_idx max_nnz = + max_degree(config_.a_indptr, config_.a_nrows, config_.stream) + + 1; if (max_nnz <= nnz_upper_bound) // use smem diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index 0cd0be11be..f22397614d 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 6297b009f4..cd48cd21a0 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -207,7 +206,7 @@ class ip_distances_gemm_t : public ip_trans_getters_t { raft::sparse::linalg::csr_transpose( config_->handle, config_->b_indptr, config_->b_indices, config_->b_data, csc_indptr.data(), csc_indices.data(), csc_data.data(), config_->b_nrows, - config_->b_ncols, config_->b_nnz, config_->allocator, config_->stream); + config_->b_ncols, config_->b_nnz, config_->stream); } value_t alpha; diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 3898189630..2ae7e365e4 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -89,9 +88,8 @@ void compute_l2(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - cusparseHandle_t handle, - std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cusparseHandle_t handle, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); CUDA_CHECK( @@ -134,7 +132,7 @@ class l2_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { return -2 * dot + q_norm + r_norm; }); @@ -201,7 +199,7 @@ class 
cosine_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t norms = sqrt(q_norm) * sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh index 4a7d5e53ff..31fed0656f 100644 --- a/cpp/include/raft/sparse/distance/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/lp_distance.cuh @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..187985627f 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -99,7 +99,6 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, size_t nnz, value_idx *children, value_t *out_delta, value_idx *out_size) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -224,7 +223,6 @@ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, const value_idx *children, size_t n_clusters, size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..b6ec190a98 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -60,12 +60,11 @@ struct distance_graph_impl &indptr, rmm::device_uvector &indices, rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); @@ -86,9 +85,8 @@ struct distance_graph_impl(tup)); }); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..033d5881d5 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -80,18 +80,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - raft::sparse::COO connected_edges(d_alloc, stream); + 
raft::sparse::COO connected_edges(stream); raft::linkage::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process @@ -136,7 +134,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, int max_iter = 10) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. @@ -145,7 +142,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, d_alloc, stream); + int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, @@ -153,7 +150,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, iters++; - n_components = linkage::get_n_components(color, m, d_alloc, stream); + n_components = linkage::get_n_components(color, m, stream); } /** diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..06fffb8aed 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -58,7 +58,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); rmm::device_uvector indices(EMPTY, stream); diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 6bb4b31bd5..3bf028d14a 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -156,14 +155,12 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) * @param out_ind: output row_ind array - * @param d_alloc: device allocator to use for temp memory * @param stream: cuda stream to use */ template size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, int *out_ind, - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 0d6c55bb65..28b9190c53 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -35,7 +34,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_components, T *out, unsigned long long seed = 1234567) { auto stream = 
handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector src_offsets(n + 1, stream); rmm::device_uvector dst_cols(nnz, stream); rmm::device_uvector dst_vals(nnz, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 128ecb21bb..5fcd336551 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -121,13 +120,11 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param in: Input COO matrix * @param out: Output symmetrized COO matrix * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -136,7 +133,7 @@ void coo_symmetrize(COO *in, COO *out, rmm::device_uvector in_row_ind(in->n_rows, stream); - convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + convert::sorted_coo_to_csr(in, in_row_ind.data(), stream); out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); @@ -249,14 +246,14 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param k: Number of n_neighbors * @param out: Output COO Matrix class * @param stream: Input cuda stream - * @param d_alloc device allocator for temporary buffers */ template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, + const value_t *restrict knn_dists, + const value_idx n, const int k, + COO *out, + cudaStream_t stream) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. 
const dim3 threadsPerBlock(TPB_X, TPB_Y); @@ -313,7 +310,6 @@ template void symmetrize(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t m, size_t n, size_t nnz, raft::sparse::COO &out) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -332,7 +328,7 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, // sort COO raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); + symm_vals.data(), stream); raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 7979de5657..7ad4b93ec0 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -53,7 +52,6 @@ namespace linalg { * @param[in] csr_nrows : Number of rows in CSR * @param[in] csr_ncols : Number of columns in CSR * @param[in] nnz : Number of nonzeros of CSR - * @param[in] allocator : Allocator for intermediate memory * @param[in] stream : Cuda stream for ordering events */ template @@ -61,9 +59,7 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, const value_idx *csr_indices, const value_t *csr_data, value_idx *csc_indptr, value_idx *csc_indices, value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, - std::shared_ptr allocator, - cudaStream_t stream) { + value_idx nnz, cudaStream_t stream) { size_t convert_csc_workspace_size = 0; CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 5df7decf8b..5383f6fe7e 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -84,9 +83,7 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, template void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { + int *cur_cnnz, T scalar, int n, cudaStream_t stream) { rmm::device_uvector ex_scan(n, stream); rmm::device_uvector cur_ex_scan(n, stream); @@ -122,13 +119,10 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param in: input COO matrix * @param out: output COO matrix * @param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { rmm::device_uvector row_count_nz(in->n_rows, stream); rmm::device_uvector row_count(in->n_rows, stream); @@ -154,7 +148,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, out->rows(), out->cols(), out->vals(), row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + in->n_rows, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -163,14 +157,11 @@ void coo_remove_scalar(COO *in, COO *out, T 
scalar, * * @param in: input COO matrix * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), stream); } }; // namespace op diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..2708f0491e 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -126,7 +125,6 @@ void max_duplicates(const raft::handle_t &handle, raft::sparse::COO &out, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..d53ceb62a9 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -62,13 +61,10 @@ struct TupleComp { * @param rows rows array from coo matrix * @param cols cols array from coo matrix * @param vals vals array from coo matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - // TODO: Remove this - std::shared_ptr d_alloc, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); @@ -81,16 +77,12 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @brief Sort the underlying COO arrays by row * @tparam T: the type name of the underlying value array * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, - // TODO: Remove this - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_sort(COO *const in, cudaStream_t stream) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + in->vals(), stream); } /** diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..9b02ae67e6 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -159,14 +159,10 @@ struct CubKVPMinReduce { */ template value_idx get_n_components(value_idx *colors, size_t n_rows, - std::shared_ptr d_alloc, cudaStream_t stream) { - value_idx *map_ids; - int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); - d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); - + rmm::device_uvector map_ids(0, stream); + int num_clusters = + raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -197,15 +193,13 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] d_alloc device allocator to use * @param[in] 
stream cuda stream for which to order cuda operations */ template void perform_1nn(cub::KeyValuePair *kvp, value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, - std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + size_t n_rows, size_t n_cols, cudaStream_t stream, + red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); @@ -324,7 +318,6 @@ void connect_components(const raft::handle_t &handle, size_t n_rows, size_t n_cols, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -336,10 +329,9 @@ void connect_components(const raft::handle_t &handle, // Normalize colors so they are drawn from a monotonically increasing set raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, stream); /** * First compute 1-nn for all colors where the color of each data point @@ -351,7 +343,7 @@ void connect_components(const raft::handle_t &handle, rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + n_rows, n_cols, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) @@ -380,7 +372,7 @@ void connect_components(const raft::handle_t &handle, size++; - raft::sparse::COO min_edges(d_alloc, stream); + raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); min_components_by_color(min_edges, out_index.data(), src_indices.data(), diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index e327386d13..d571eabb77 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -120,9 +119,7 @@ class sparse_knn_t { const value_idx *queryIndices_, const value_t *queryData_, size_t queryNNZ_, int n_query_rows_, int n_query_cols_, value_idx *output_indices_, value_t *output_dists_, int k_, - cusparseHandle_t cusparseHandle_, - std::shared_ptr allocator_, - cudaStream_t stream_, + cusparseHandle_t cusparseHandle_, cudaStream_t stream_, size_t batch_size_index_ = 2 << 14, // approx 1M size_t batch_size_query_ = 2 << 14, raft::distance::DistanceType metric_ = @@ -144,7 +141,6 @@ class sparse_knn_t { output_dists(output_dists_), k(k_), cusparseHandle(cusparseHandle_), - allocator(allocator_), stream(stream_), batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), @@ -369,7 +365,6 @@ class sparse_knn_t { dist_config.a_data = query_batch_data; dist_config.handle = cusparseHandle; - dist_config.allocator = allocator; dist_config.stream = stream; if (raft::sparse::distance::supportedDistance.find(metric) == @@ -395,8 +390,6 @@ class sparse_knn_t { cusparseHandle_t cusparseHandle; - std::shared_ptr allocator; - cudaStream_t stream; }; @@ -418,7 +411,6 @@ class sparse_knn_t { * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) * @param[in] k the number of neighbors to query * @param[in] cusparseHandle the initialized 
cusparseHandle instance to use - * @param[in] allocator device allocator instance to use * @param[in] stream CUDA stream to order operations with respect to * @param[in] batch_size_index maximum number of rows to use from index matrix per batch * @param[in] batch_size_query maximum number of rows to use from query matrix per batch @@ -432,9 +424,7 @@ void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, const value_idx *queryIndices, const value_t *queryData, size_t queryNNZ, int n_query_rows, int n_query_cols, value_idx *output_indices, value_t *output_dists, int k, - cusparseHandle_t cusparseHandle, - std::shared_ptr allocator, - cudaStream_t stream, + cusparseHandle_t cusparseHandle, cudaStream_t stream, size_t batch_size_index = 2 << 14, // approx 1M size_t batch_size_query = 2 << 14, raft::distance::DistanceType metric = @@ -443,8 +433,8 @@ void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, sparse_knn_t( idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, cusparseHandle, allocator, stream, - batch_size_index, batch_size_query, metric, metricArg) + output_indices, output_dists, k, cusparseHandle, stream, batch_size_index, + batch_size_query, metric, metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cf225087a..1cdd66f516 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -96,7 +96,6 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); size_t nnz = m * k; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..2cdf9bf4f5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,15 +22,12 @@ #include #include -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * @brief Flat C++ API function to build an approximate nearest neighbors index * from an index array and a set of parameters. 
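With the allocator parameter dropped, the sparse brute-force k-NN entry point is driven entirely by the cuSPARSE handle and stream. A minimal sketch of a call under the new signature; every CSR pointer and size below is a placeholder, and the raft::sparse::selection namespace is assumed from knn.cuh's location rather than stated by the patch:

    #include <raft/sparse/selection/knn.cuh>
    #include <rmm/device_uvector.hpp>

    // Hypothetical caller-side sketch: query k neighbors of each query row
    // against a CSR index. Temporary workspace is drawn from RMM's current
    // device resource instead of a passed-in allocator.
    void sparse_knn_example(const raft::handle_t &handle,
                            const int *idx_indptr, const int *idx_indices,
                            const float *idx_data, size_t idx_nnz,
                            int n_idx_rows, int n_idx_cols,
                            const int *query_indptr, const int *query_indices,
                            const float *query_data, size_t query_nnz,
                            int n_query_rows, int n_query_cols, int k) {
      cudaStream_t stream = handle.get_stream();
      rmm::device_uvector<int> out_inds(n_query_rows * k, stream);
      rmm::device_uvector<float> out_dists(n_query_rows * k, stream);
      raft::sparse::selection::brute_force_knn(
        idx_indptr, idx_indices, idx_data, idx_nnz, n_idx_rows, n_idx_cols,
        query_indptr, query_indices, query_data, query_nnz, n_query_rows,
        n_query_cols, out_inds.data(), out_dists.data(), k,
        handle.get_cusparse_handle(), stream);
    }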
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..c0345a01e6 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -46,7 +46,6 @@ #include #include -#include #include #include @@ -145,8 +144,7 @@ void approx_knn_build_index(raft::handle_t &handle, // perform preprocessing // k set to 0 (unused during preprocessing / revertion) std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + create_processor(metric, n, D, 0, false, handle.get_stream()); query_metric_processor->preprocess(index_array); @@ -183,7 +181,7 @@ void approx_knn_search(raft::handle_t &handle, float *distances, // perform preprocessing std::unique_ptr> query_metric_processor = create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index a6834552e3..6db8fb7a8e 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -181,7 +181,6 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[out] res_I pointer to device memory for returning k nearest indices * @param[out] res_D pointer to device memory for returning k nearest distances * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory * @param[in] userStream the main cuda stream to use * @param[in] internalStreams optional when n_params > 0, the index partitions can be * queried in parallel using these streams. 
Note that n_int_streams also @@ -200,7 +199,6 @@ template void brute_force_knn_impl(std::vector &input, std::vector &sizes, IntType D, float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, int n_int_streams = 0, bool rowMajorIndex = true, @@ -230,15 +228,14 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); std::vector>> metric_processors( input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor(metric, sizes[i], D, k, + rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index d9292440d1..876e91e877 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,6 @@ namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; /** * @brief A virtual class defining pre- and post-processing * for metrics. This class will temporarily modify its given @@ -56,15 +54,12 @@ class CosineMetricProcessor : public MetricProcessor { size_t n_rows_; size_t n_cols_; cudaStream_t stream_; - std::shared_ptr device_allocator_; rmm::device_uvector colsums_; public: CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream, - std::shared_ptr allocator) - : device_allocator_(allocator), - stream_(stream), + cudaStream_t stream) + : stream_(stream), colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), @@ -104,10 +99,8 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { public: CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, - std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), + bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream) {} void preprocess(math_t *data) { @@ -161,18 +154,18 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { + cudaStream_t userStream) { MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = + new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + userStream); break; default: mp = new DefaultMetricProcessor(); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..71c547c281 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ 
-18,15 +18,12 @@ #include "detail/knn_brute_force_faiss.cuh" -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - template inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, value_idx *outV, size_t n_samples, int n_parts, @@ -72,8 +69,7 @@ inline void brute_force_knn( std::vector int_streams = handle.get_internal_streams(); detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), + k, handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, translations, metric, metric_arg); } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..efa98313b6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -72,52 +73,31 @@ struct vector_view_t { : buffer_(buffer), size_(sz) {} vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { - other.buffer_ = nullptr; - other.size_ = 0; - } + : buffer_(other.raw()), size_(other.size()) {} vector_view_t& operator=(vector_view_t&& other) { - buffer_ = other.buffer_; - size_ = other.size_; - - other.buffer_ = nullptr; - other.size_ = 0; + buffer_ = other.raw(); + size_ = other.size(); + return *this; } }; -// allocatable vector, using raft handle allocator -// template class vector_t { - handle_t const& handle_; - value_type* buffer_; - size_type size_; - cudaStream_t stream_; - public: vector_t(handle_t const& raft_handle, size_type sz) - : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), - size_(sz), - stream_(raft_handle.get_stream()) {} - - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); - } + : buffer_(sz, raft_handle.get_stream()) {} - size_type size(void) const { return size_; } + size_type size(void) const { return buffer_.size(); } - value_type* raw(void) { return buffer_; } + value_type* raw(void) { return buffer_.data(); } - value_type const* raw(void) const { return buffer_; } + value_type const* raw(void) const { return buffer_.data(); } template value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + return thrust::reduce(t_exe_pol, buffer_.data(), + buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ?
right : -right; @@ -127,8 +106,11 @@ class vector_t { template void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_, size_, value); + thrust::fill_n(t_exe_pol, buffer_.data(), buffer_.size(), value); } + + private: + rmm::device_uvector buffer_; }; template diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..2a159994dc 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -20,7 +20,6 @@ #include #include -#include #include "../test_utils.h" #include @@ -57,9 +56,7 @@ TEST_F(MakeMonotonicTest, Result) { raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); - make_monotonic(actual, data, m, stream, allocator); + make_monotonic(actual, data, m, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -76,8 +73,6 @@ TEST_F(MakeMonotonicTest, Result) { TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); int n_rows = 6; float *y_d; @@ -86,20 +81,20 @@ TEST(labelTest, Classlabels) { float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); - int n_classes; - float *y_unique_d; - getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); + rmm::device_uvector y_unique_d(0, stream); + int n_classes = getUniquelabels(y_unique_d, y_d, n_rows, stream); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); float *y_relabeled_d; raft::allocate(y_relabeled_d, n_rows); - getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); + getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2, + stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, @@ -107,7 +102,6 @@ TEST(labelTest, Classlabels) { CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(y_d)); - CUDA_CHECK(cudaFree(y_unique_d)); CUDA_CHECK(cudaFree(y_relabeled_d)); } }; // namespace label diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..0648f165a6 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "../test_utils.h" diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..979b12f237 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include "../test_utils.h" #include "binary_op.cuh" @@ -136,9 +136,9 @@ class BinaryOpAlignment : public ::testing::Test { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. 
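The test hunks in this patch all apply the same mechanical substitution, and the BinaryOpAlignment hunk that continues below is typical: a raft::mr::device::buffer built from handle.get_device_allocator() and resized afterwards becomes an rmm::device_uvector sized at construction. A minimal before/after sketch, assuming only a valid cudaStream_t; the names are illustrative rather than taken from the diff, and the element type is spelled out explicitly because the original template arguments do not survive in this copy of the patch:

#include <rmm/device_uvector.hpp>

// Sketch of the allocation idiom this patch converges on; `stream` is
// assumed to be a valid, live CUDA stream.
void buffer_migration_sketch(cudaStream_t stream) {
  // before: raft::mr::device::buffer<float> x(handle.get_device_allocator(), stream);
  //         x.resize(1024, stream);
  // after: one stream-ordered container, sized at construction
  rmm::device_uvector<float> x(1024, stream);
  x.resize(2048, stream);  // stream-ordered reallocation, old contents kept
  x.release();             // gives up the allocation; size() becomes zero
}
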
int n = 1024; - mr::device::buffer x(handle.get_device_allocator(), stream, n); - mr::device::buffer y(handle.get_device_allocator(), stream, n); - mr::device::buffer z(handle.get_device_allocator(), stream, n); + rmm::device_uvector x(n, stream); + rmm::device_uvector y(n, stream); + rmm::device_uvector z(n, stream); CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 3c84d0db5f..2cb3bed754 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..089edd738e 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..fe42cea8b3 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -15,22 +15,21 @@ */ #include +#include #include #include -#include +#include #include -#include namespace raft { namespace mr { namespace device { TEST(Raft, DeviceBufferAlloc) { - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream); + rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); // explicit allocation after construction buff.resize(20, stream); @@ -39,12 +38,12 @@ TEST(Raft, DeviceBufferAlloc) { buff.resize(10, stream); ASSERT_EQ(10, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // use these methods without the explicit stream parameter - buff.resize(20); + buff.resize(20, stream); ASSERT_EQ(20, buff.size()); - buff.resize(10); + buff.resize(10, stream); ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); @@ -62,11 +61,10 @@ TEST(Raft, DeviceBufferZeroResize) { rmm::mr::set_current_device_resource(limit_mr.get()); - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream, 10); + rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); // explicit allocation after construction buff.resize(0, stream); @@ -75,7 +73,7 @@ TEST(Raft, DeviceBufferZeroResize) { buff.resize(20, stream); ASSERT_EQ(20, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // Now check that there is no memory left. 
(Used to not be true) diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..38c3e5bfbb 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -74,9 +74,6 @@ class CSRAddTest } void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); @@ -96,7 +93,7 @@ class CSRAddTest Index_ nnz = linalg::csr_add_calc_inds( ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + n_rows, ind_result, stream); ASSERT_TRUE(nnz == nnz_result); ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..dd6ba1479e 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -57,14 +56,12 @@ class ConnectComponentsTest : public ::testing::TestWithParam< void basicTest() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); params = ::testing::TestWithParam< ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_stream()); rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); @@ -77,7 +74,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< /** * 1. Construct knn graph */ - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph( handle, data.data(), params.n_row, params.n_col, @@ -85,7 +82,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + params.n_row + 1, stream); /** * 2. 
Construct MST, sorted by weights @@ -112,7 +109,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, - d_alloc, stream); + stream); auto output_mst = raft::mst::mst( handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..5bee5a000e 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -19,7 +19,6 @@ #include #include "../test_utils.h" -#include #include #include @@ -61,8 +60,6 @@ typedef SparseConvertCSRTest SortedCOOToCSR; TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); int nnz = 8; @@ -78,7 +75,7 @@ TEST_P(SortedCOOToCSR, Result) { raft::update_device(in, in_h, nnz, stream); raft::update_device(exp, exp_h, 4, stream); - convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + convert::sorted_coo_to_csr(in, nnz, out, 4, stream); ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..b158ffdedd 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -19,8 +19,6 @@ #include #include -#include -#include #include @@ -92,8 +90,6 @@ class CSRRowSliceTest void SetUp() override { params = ::testing::TestWithParam< CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..d04799befa 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -77,8 +75,6 @@ class CSRToDenseTest void SetUp() override { params = ::testing::TestWithParam< CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..5baeadd16f 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -20,8 +20,6 @@ #include #include -#include -#include #include @@ -94,8 +92,6 @@ class CSRTransposeTest void SetUp() override { params = ::testing::TestWithParam< CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -103,7 +99,7 @@ class CSRTransposeTest raft::sparse::linalg::csr_transpose( handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + params.nrows, params.ncols, params.nnz, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 3582075bfb..8b2e69e5a2 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -150,8 +149,6 @@ class SparseDistanceCOOSPMVTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); - std::shared_ptr 
alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -172,7 +169,6 @@ class SparseDistanceCOOSPMVTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/dist_csr_spmv.cu b/cpp/test/sparse/dist_csr_spmv.cu index c32748a04e..b087c3a612 100644 --- a/cpp/test/sparse/dist_csr_spmv.cu +++ b/cpp/test/sparse/dist_csr_spmv.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -130,8 +129,6 @@ class SparseDistanceCSRSPMVTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceCSRSPMVInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -151,7 +148,6 @@ class SparseDistanceCSRSPMVTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 4247e374d6..729394fd2c 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include @@ -83,8 +82,6 @@ class SparseDistanceTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -106,7 +103,6 @@ class SparseDistanceTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..4634e5fc0e 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include #include @@ -53,13 +52,11 @@ typedef SparseFilterTests COORemoveZeros; TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); float *in_h_vals = new float[params.nnz]; - COO in(alloc, stream, params.nnz, 5, 5); + COO in(stream, params.nnz, 5, 5); raft::random::Rng r(params.seed); r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); @@ -82,7 +79,7 @@ TEST_P(COORemoveZeros, Result) { raft::update_device(in.cols(), in_h_cols, params.nnz, stream); raft::update_device(in.vals(), in_h_vals, params.nnz, stream); - op::coo_sort(&in, alloc, stream); + op::coo_sort(&in, stream); int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; @@ -91,14 +88,14 @@ TEST_P(COORemoveZeros, Result) { out_vals_ref_h[0] = in_h_vals[4]; out_vals_ref_h[1] = in_h_vals[1]; - COO out_ref(alloc, stream, 2, 5, 5); - COO out(alloc, stream); + COO out_ref(stream, 2, 5, 5); + COO out(stream); raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); - op::coo_remove_zeros<32, 
float>(&in, &out, alloc, stream); + op::coo_remove_zeros<32, float>(&in, &out, stream); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 4759eebe4b..e92eb53ada 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -24,8 +24,6 @@ #include #include -#include -#include namespace raft { namespace sparse { @@ -95,8 +93,6 @@ class SparseKNNTest void SetUp() override { params = ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -110,8 +106,7 @@ class SparseKNNTest raft::sparse::selection::brute_force_knn( indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, nnz, n_rows, params.n_cols, out_indices, out_dists, k, cusparseHandle, - alloc, stream, params.batch_size_index, params.batch_size_query, - params.metric); + stream, params.batch_size_index, params.batch_size_query, params.metric); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..d4e9a915e5 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -64,10 +64,9 @@ class KNNGraphTest raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); - out = new raft::sparse::COO(alloc, stream); + out = new raft::sparse::COO(stream); allocate(X, params.X.size()); diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 64403eab7f..291880bf53 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -106,13 +105,11 @@ __global__ void computeTheNumerator(const T* firstClusterArray, * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr * @param stream: the cudaStream object */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { +double compute_rand_index(T* firstClusterArray, T* secondClusterArray, + uint64_t size, cudaStream_t stream) { //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); @@ -190,8 +187,7 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + compute_rand_index(labels, labels_ref, params.n_row, handle.get_stream()); } void SetUp() override { basicTest(); } diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..8ff4a600bc 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -53,7 +53,6 @@ class SparseReduceTest void Run() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); @@ -76,7 +75,7 @@ class SparseReduceTest raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); - raft::sparse::COO out(d_alloc, stream); + raft::sparse::COO out(stream); 
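The raft::sparse::COO construction just above shows the other recurring change: the container and the ops that fill it drop the explicit allocator and take the stream directly. A hedged sketch of the two construction paths these tests rely on; the element type is an assumption, since the original template arguments were stripped from this copy of the patch:

#include <raft/sparse/coo.cuh>

// Illustrative only: the allocator-free COO API after this patch.
void coo_sketch(cudaStream_t stream) {
  // empty container; the device arrays are allocated by whichever sparse
  // op fills it (symmetrize, max_duplicates, coo_remove_zeros, ...)
  raft::sparse::COO<float> out(stream);
  // pre-sized container: nnz = 8 entries of a 5 x 5 matrix
  raft::sparse::COO<float> in(stream, 8, 5, 5);
  // rows()/cols()/vals() still hand back raw device pointers for
  // raft::update_device(...) style initialization, as in the tests above
}
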
raft::sparse::op::max_duplicates(handle, out, in_rows.data(), in_cols.data(), in_vals.data(), params.in_rows.size(), params.m, params.n); diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..74aa9d6eaf 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -81,8 +81,6 @@ class SparseSelectionTest void SetUp() override { params = ::testing::TestWithParam< SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..9deac1b82f 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include @@ -55,8 +54,6 @@ TEST_P(COOSort, Result) { raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); raft::allocate(in_vals, params.nnz); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); @@ -80,7 +77,7 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols, in_cols_h, params.nnz, stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, stream); ASSERT_TRUE( diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index e96e5c289c..a060b8128a 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -79,7 +79,6 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); make_data(); @@ -92,7 +91,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); - raft::sparse::COO out(alloc, stream); + raft::sparse::COO out(stream); raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); @@ -149,9 +148,6 @@ TEST_P(COOSymmetrize, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - int nnz = 8; int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; @@ -165,19 +161,19 @@ TEST_P(COOSymmetrize, Result) { float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; - COO in(alloc, stream, nnz, 4, 4); + COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); raft::update_device(in.cols(), *&in_cols_h, nnz, stream); raft::update_device(in.vals(), *&in_vals_h, nnz, stream); - COO out(alloc, stream); + COO out(stream); linalg::coo_symmetrize<32, float>( &in, &out, [] __device__(int row, int col, float val, float trans) { return val + trans; }, - alloc, stream); + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..e62bec4381 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -30,8 +29,6 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - auto alloc = std::make_shared(); - // Allocate input 
raft::allocate(d_train_inputs, n * d); From 565cf68404670ec65a5db26237dff6d03447315e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 1 Jul 2021 11:52:45 +0200 Subject: [PATCH 03/17] RAFT handle update --- cpp/include/raft/handle.hpp | 52 +++++++++++++++++++---------------- python/raft/common/handle.pxd | 2 -- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..1b1923abb7 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include "cudart_utils.h" namespace raft { @@ -48,6 +47,9 @@ namespace raft { * necessary cuda kernels and/or libraries */ class handle_t { + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + private: static constexpr int kNumDefaultWorkerStreams = 0; @@ -63,9 +65,7 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - streams_(n_streams), - device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + streams_(n_streams) { create_resources(); } @@ -86,8 +86,6 @@ class handle_t { "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); } @@ -97,26 +95,15 @@ class handle_t { int get_device() const { return dev_id_; } - void set_stream(cudaStream_t stream) { user_stream_ = stream; } + void set_stream(cudaStream_t stream) { + thrust_policy_initialized_ = false; + user_stream_ = stream; + } cudaStream_t get_stream() const { return user_stream_; } rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { - device_allocator_ = allocator; - } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } - - void set_host_allocator(std::shared_ptr allocator) { - host_allocator_ = allocator; - } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } - cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { @@ -153,6 +140,23 @@ class handle_t { return cusparse_handle_; } + thrust_exec_policy_t get_thrust_policy() const { + std::lock_guard _(mutex_); + if (!thrust_policy_initialized_) { + if (!thrust_policy_) { + thrust_policy_ = + (thrust_exec_policy_t*)malloc(sizeof(thrust_exec_policy_t)); + } + *thrust_policy_ = rmm::exec_policy(this->get_stream()); + thrust_policy_initialized_ = true; + } + return *thrust_policy_; + } + + thrust_exec_policy_t get_thrust_policy(cudaStream_t stream) const { + return rmm::exec_policy(stream); + } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); @@ -236,8 +240,8 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - std::shared_ptr device_allocator_; - std::shared_ptr host_allocator_; + mutable thrust_exec_policy_t* thrust_policy_{nullptr}; + mutable bool thrust_policy_initialized_{false}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; diff --git a/python/raft/common/handle.pxd 
b/python/raft/common/handle.pxd index 6076640312..884d81bed1 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -34,7 +34,5 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t() except + handle_t(int ns) except + void set_stream(_Stream s) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + _Stream get_stream() except + int get_num_internal_streams() except + From eb1253ad0e4924a81910d07624eada21624eb52c Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 1 Jul 2021 11:53:51 +0200 Subject: [PATCH 04/17] Use raft::handle_t::get_thrust_policy to create thrust policy --- cpp/include/raft/sparse/distance/coo_spmv.cuh | 2 - .../coo_spmv_strategies/base_strategy.cuh | 1 - .../coo_mask_row_iterators.cuh | 1 - .../coo_spmv_strategies/hash_strategy.cuh | 2 +- .../raft/sparse/distance/l2_distance.cuh | 5 +- .../sparse/hierarchy/detail/agglomerative.cuh | 3 +- .../hierarchy/detail/connectivities.cuh | 5 +- .../raft/sparse/hierarchy/detail/mst.cuh | 3 - cpp/include/raft/sparse/linalg/spectral.cuh | 5 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 1 - .../raft/sparse/mst/detail/mst_solver_inl.cuh | 12 +- cpp/include/raft/sparse/op/reduce.cuh | 8 +- .../sparse/selection/connect_components.cuh | 3 +- cpp/include/raft/spectral/cluster_solvers.hpp | 9 +- cpp/include/raft/spectral/kmeans.hpp | 129 ++++++++---------- cpp/include/raft/spectral/matrix_wrappers.hpp | 43 +++--- .../raft/spectral/modularity_maximization.hpp | 28 ++-- cpp/include/raft/spectral/partition.hpp | 28 ++-- cpp/include/raft/spectral/spectral_util.hpp | 22 ++- cpp/test/cluster_solvers.cu | 9 +- cpp/test/eigen_solvers.cu | 7 +- cpp/test/spectral_matrix.cu | 19 ++- 22 files changed, 140 insertions(+), 205 deletions(-) diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 4f979aa6f1..24be171900 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -34,8 +34,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..194799aed0 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -23,7 +23,6 @@ #include #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..74eb37bc2b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -20,7 +20,6 @@ #include "../utils.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..a95c6ff85b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -55,7 +55,7 @@ class hash_strategy : public coo_spmv_strategy { rmm::device_uvector &mask_indptr, std::tuple &n_rows_divided, 
cudaStream_t stream) { - auto policy = rmm::exec_policy(stream); + auto policy = this->config.handle.get_thrust_policy(); auto less = thrust::copy_if( policy, thrust::make_counting_iterator(value_idx(0)), diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 5f89101082..534280191b 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -245,9 +245,8 @@ class hellinger_expanded_distances_t : public distances_t { : config_(&config), workspace(0, config.handle.get_stream()) {} void compute(value_t *out_dists) { - raft::mr::device::buffer coo_rows( - config_->handle.get_device_allocator(), config_->handle.get_stream(), - max(config_->b_nnz, config_->a_nnz)); + rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), + config_->handle.get_stream()); raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, coo_rows.data(), config_->b_nnz, diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 187985627f..2a7a8b3e4e 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -224,7 +223,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, const value_idx *children, size_t n_clusters, size_t n_leaves) { auto stream = handle.get_stream(); - auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Handle special case where n_clusters == 1 if (n_clusters == 1) { diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index b6ec190a98..31e4a0f263 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -61,7 +60,7 @@ struct distance_graph_impl &indices, rmm::device_uvector &data, int c) { auto stream = handle.get_stream(); - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(stream); @@ -77,7 +76,7 @@ struct distance_graph_impl &tup) { bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 033d5881d5..6ef6f9879b 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -25,12 +25,9 @@ #include #include -#include - #include #include #include -#include namespace raft { namespace hierarchy { diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 28b9190c53..d15c2cdf23 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -63,7 +63,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); + auto t_exe_p = handle.get_thrust_policy(); using 
thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, @@ -83,8 +83,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using value_type_t = value_type; std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const *__restrict__ obs, index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5fcd336551..614c9d830e 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..029b76a945 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -38,7 +37,6 @@ #include #include -#include namespace raft { namespace mst { @@ -88,7 +86,7 @@ MST_solver::MST_solver( sm_count = handle_.get_device_properties().multiProcessorCount; //Initially, color holds the vertex id as color - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); thrust::sequence(policy, color_index, color_index + v, 0); @@ -227,7 +225,7 @@ template alteration_t MST_solver::alteration_max() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); rmm::device_vector tmp(e); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); @@ -327,7 +325,7 @@ template void MST_solver::min_edge_per_vertex() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), @@ -354,7 +352,7 @@ void MST_solver::max()); @@ -411,7 +409,7 @@ template void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); auto curr_mst_edge_count = prev_mst_edge_count[0]; diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 2708f0491e..09a35720fb 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -126,16 +125,15 @@ void max_duplicates(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { auto stream = handle.get_stream(); - - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan rmm::device_uvector diff(nnz + 1, stream); compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + 
thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), + diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 9b02ae67e6..390522c9bc 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -30,7 +30,6 @@ #include #include #include -#include #include @@ -361,7 +360,7 @@ void connect_components(const raft::handle_t &handle, raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), out_index.data() + out_index.size(), out_index.data()); // compute final size diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..6f507331d9 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -42,19 +42,16 @@ struct kmeans_solver_t { size_type_t> const& config) : config_(config) {} - template std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed); + kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, + config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..5928c727c6 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -325,7 +325,6 @@ static __global__ void divideCentroids( * Centroid is randomly chosen with k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -341,12 +340,9 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. 
*/ -template -static int chooseNewCentroid(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, - value_type_t rand, +template +static int chooseNewCentroid(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -357,8 +353,9 @@ static int chooseNewCentroid(handle_t const& handle, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -417,10 +414,7 @@ static int chooseNewCentroid(handle_t const& handle, * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -439,14 +433,12 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ -template +template static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { + handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -458,8 +450,9 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -486,8 +479,8 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -499,8 +492,8 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + 
IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -529,10 +522,7 @@ static int initializeCentroids( * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -553,16 +543,18 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto cublas_h = handle.get_cublas_handle(); +template +static int assignCentroids(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -606,10 +598,7 @@ static int assignCentroids( * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -628,10 +617,8 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, +template +static int updateCentroids(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, @@ -649,8 +636,9 @@ static int updateCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory thrust::device_ptr obs_copy(work); @@ -722,10 +710,7 @@ namespace raft { * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. 
- * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -754,11 +739,10 @@ namespace raft { * @param seed random seed to be used. * @return error flag. */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, @@ -785,16 +769,17 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Initialization // ------------------------------------------------------- - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -840,21 +825,21 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, + work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids @@ -868,12 +853,11 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) 
+ if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); emptyCentroid = (thrust::find(thrust_exec_policy, @@ -905,10 +889,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -926,11 +907,10 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, value_type_t& residual, index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; @@ -950,9 +930,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Perform k-means return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), + centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index efa98313b6..42fc621a1a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -85,7 +85,8 @@ template class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) - : buffer_(sz, raft_handle.get_stream()) {} + : buffer_(sz, raft_handle.get_stream()), + thrust_policy(raft_handle.get_thrust_policy()) {} size_type size(void) const { return buffer_.size(); } @@ -93,9 +94,8 @@ class vector_t { value_type const* raw(void) const { return buffer_.data(); } - template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_.data(), + value_type nrm1() const { + return thrust::reduce(thrust_policy, buffer_.data(), buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? 
left : -left; @@ -104,13 +104,15 @@ class vector_t { }); } - template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_.data(), buffer_.size(), value); + void fill(value_type value) { + thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } private: + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; rmm::device_uvector buffer_; + const thrust_exec_policy_t thrust_policy; }; template @@ -262,31 +264,26 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - template - laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, - index_type const* row_offsets, + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - template laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, csr_m.col_indices_, csr_m.values_, csr_m.nrows_, csr_m.nnz_), diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } @@ -333,27 +330,19 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, row_offsets, col_indices, values, nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..1fe7819a7e 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -79,19 +79,18 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -101,7 +100,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -111,7 +110,7 @@ std::tuple modularity_maximization( eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // notice that at this point the matrix has already been transposed, so we are scaling // columns @@ -119,8 +118,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -138,9 +137,8 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, @@ -163,15 +161,15 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; // Initialize output modularity = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, + clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } @@ -180,7 +178,7 @@ void analyzeModularity(handle_t const &handle, modularity += partModularity; } - modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); + modularity = modularity / B.diagonal_.nrm1(); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..a994895886 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,19 +62,18 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
*/ -template +template std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -89,7 +88,7 @@ std::tuple partition( // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -99,11 +98,11 @@ std::tuple partition( eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -129,9 +128,8 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -140,8 +138,8 @@ void analyzePartition(handle_t const &handle, vertex_t i; vertex_t n = csr_m.nrows_; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -155,7 +153,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; // Initialize output cost = 0; @@ -164,8 +162,8 @@ void analyzePartition(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, + part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..de9ff1917f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -108,13 +108,12 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { return cudaSuccess; } -template -void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { - auto cublas_h = handle.get_cublas_handle(); +template +void 
transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, + weight_t* eigVecs) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -187,16 +186,15 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template -bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, +template +bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, + weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..d280b3e95c 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -49,8 +49,7 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } TEST(Raft, ModularitySolvers) { @@ -89,14 +88,12 @@ TEST(Raft, ModularitySolvers) { auto stream = h.get_stream(); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..8025d8dcd6 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -102,16 +102,15 @@ TEST(Raft, SpectralSolvers) { auto stream = h.get_stream(); - auto t_exe_p = thrust::cuda::par.on(stream); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, + clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..b85d35e3f8 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -57,27 +57,24 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm2.row_offsets_); auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); - auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr 
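The same substitution repeats through the spectral headers: the raft handle now supplies the Thrust policy alongside the stream and the cuBLAS handle, so the ThrustExePolicy template parameter disappears from every signature. A minimal sketch of a caller under the new convention (function name and arguments are illustrative):

#include <raft/handle.hpp>
#include <thrust/sequence.h>

void iota_device(const raft::handle_t& handle, int* out, int n) {
  // handle.get_thrust_policy() is an RMM-backed policy bound to
  // handle.get_stream(), replacing an explicitly threaded policy object.
  thrust::sequence(handle.get_thrust_policy(), out, out + n, 0);
}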
args - auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { - laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + auto cnstr_lm2 = [&h, &sm2](void) { + laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args - auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { - modularity_matrix_t mm2{h, t_exe_pol, sm2}; + auto cnstr_mm2 = [&h, &sm2](void) { + modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } From 0bc3d2b5aa314b26f7b6f1dea5313789e064703b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Fri, 2 Jul 2021 11:49:55 +0200 Subject: [PATCH 05/17] Use of rmm::device_scalar --- cpp/include/raft/comms/test.hpp | 12 ++++++------ cpp/include/raft/label/classlabels.cuh | 5 +++-- cpp/include/raft/lap/lap.cuh | 6 +++--- cpp/include/raft/lap/lap_functions.cuh | 3 ++- cpp/include/raft/linalg/eig.cuh | 13 ++++++------- cpp/include/raft/linalg/qr.cuh | 5 +++-- cpp/include/raft/linalg/svd.cuh | 5 +++-- cpp/include/raft/matrix/math.cuh | 3 ++- cpp/include/raft/sparse/csr.cuh | 5 +++-- cpp/test/label/merge_labels.cu | 5 +++-- cpp/test/linalg/cholesky_r1.cu | 5 +++-- cpp/test/linalg/map_then_reduce.cu | 5 +++-- cpp/test/sparse/knn_graph.cu | 8 ++++---- cpp/test/sparse/symmetrize.cu | 8 ++++---- 14 files changed, 48 insertions(+), 40 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 9f5edc1425..17db8e88af 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -43,7 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK( cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); @@ -74,7 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -101,7 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -130,7 +130,7 @@ bool test_collective_allgather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d(communicator.get_size(), stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -162,7 +162,7 @@ bool test_collective_gather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d( communicator.get_rank() == root ? 
communicator.get_size() : 0, stream); @@ -244,7 +244,7 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); rmm::device_uvector temp_d(sends.size(), stream); - rmm::device_uvector recv_d(1, stream); + rmm::device_scalar recv_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index f2b2463165..b7878dc276 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -48,7 +49,7 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, std::shared_ptr allocator) { rmm::device_uvector y2(n, stream); rmm::device_uvector y3(n, stream); - rmm::device_uvector d_num_selected(1, stream); + rmm::device_scalar d_num_selected(stream); size_t bytes = 0; size_t bytes2 = 0; @@ -64,7 +65,7 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); + *n_unique = d_num_selected.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 29c1cf6aa1..f64afb3549 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -233,16 +233,16 @@ class LinearAssignmentProblem { int hungarianStep3() { int next; - rmm::device_uvector flag_v(1, handle_.get_stream()); + rmm::device_scalar flag_v(handle_.get_stream()); bool h_flag = false; - raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); + flag_v.set_value_async(h_flag, handle_.get_stream()); detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); - raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); + h_flag = flag_v.value(handle_.get_stream()); next = h_flag ? 
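The hungarianStep3 hunk above contains the complete device_scalar round trip that replaces paired update_device/update_host calls. Distilled into a standalone sketch (the kernel launch is elided and the function name is illustrative):

#include <rmm/device_scalar.hpp>

bool read_device_flag(cudaStream_t stream) {
  rmm::device_scalar<bool> flag(stream);  // device storage, uninitialized
  bool init = false;
  flag.set_value_async(init, stream);     // stream-ordered H2D copy
  // ... launch work that may write true through flag.data() ...
  return flag.value(stream);              // D2H copy; synchronizes stream
}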
4 : 5; diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index ce3f67f8fe..1e12e3917c 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -35,6 +35,7 @@ #include #include #include +#include #include namespace raft { @@ -408,7 +409,7 @@ inline void dualUpdate(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks; - rmm::device_uvector sp_min_v(1, handle.get_stream()); + rmm::device_scalar sp_min_v(handle.get_stream()); raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 8ab7011db4..951e2b8e7a 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -53,7 +54,7 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, n_cols, eig_vals, &lwork)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector d_dev_info(1, stream); + rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); @@ -63,8 +64,7 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + int dev_info = d_dev_info.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " @@ -105,7 +105,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector d_dev_info(1, stream); + rmm::device_scalar d_dev_info(stream); rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { @@ -127,8 +127,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + int dev_info = d_dev_info.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" @@ -177,7 +176,7 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector dev_info(1, stream); + rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index 14771f289d..cc912d7d86 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace raft { @@ -52,7 +53,7 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); int Lwork; CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); @@ -97,7 +98,7 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, cudaMemcpyDeviceToDevice, stream)); int Lwork; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, R_full_ncols, R_full.data(), diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index a1507cfa9b..8b40a80903 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "eig.cuh" #include "gemm.cuh" @@ -69,7 +70,7 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int m = n_rows; const int n = n_cols; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); T *d_rwork = nullptr; int lwork = 0; @@ -170,7 +171,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int m = n_rows; int n = n_cols; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); int lwork = 0; int econ = 1; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index f84b85d222..51e83b854b 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -294,7 +295,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, auto d_src = src; auto d_dest = dest; - rmm::device_uvector d_sum(1, stream); + rmm::device_scalar d_sum(stream); auto *d_sum_ptr = d_sum.data(); auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index b30d7af8b4..d571a4999a 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -219,7 +220,7 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, Index_ nnz, Index_ N, std::shared_ptr d_alloc, cudaStream_t stream, Lambda filter_op) { - rmm::device_uvector m(1, stream); + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, @@ -253,7 +254,7 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, Index_ nnz, Index_ N, std::shared_ptr d_alloc, cudaStream_t stream) { - rmm::device_uvector m(1, stream); + rmm::device_scalar 
m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..f58daae01b 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -50,7 +51,7 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} + m(stream) {} void Run() { raft::update_device(labels_a.data(), params.labels_a.data(), params.N, @@ -76,7 +77,7 @@ class MergeLabelsTest raft::handle_t handle; cudaStream_t stream; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_uvector mask, m; + rmm::device_scalar mask, m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 3c84d0db5f..3265ef9e13 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ class CholeskyR1Test : public ::testing::Test { : G(n_rows * n_rows, handle.get_stream()), L(n_rows * n_rows, handle.get_stream()), L_exp(n_rows * n_rows, handle.get_stream()), - devInfo(1, handle.get_stream()), + devInfo(handle.get_stream()), workspace(0, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); @@ -119,7 +120,7 @@ class CholeskyR1Test : public ::testing::Test { math_t G2_host[4] = {3, 4, 2, 1}; - rmm::device_uvector devInfo; + rmm::device_scalar devInfo; rmm::device_uvector G; rmm::device_uvector L_exp; rmm::device_uvector L; diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index c78dd9e8fb..2a8b356f3f 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -132,7 +133,7 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() - : input(n, handle.get_stream()), output(1, handle.get_stream()) { + : input(n, handle.get_stream()), output(handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -172,7 +173,7 @@ class MapGenericReduceTest : public ::testing::Test { raft::handle_t handle; cudaStream_t stream; rmm::device_uvector input; - rmm::device_uvector output; + rmm::device_scalar output; }; using IoTypePair = diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..e8391c3ea4 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -77,9 +78,8 @@ class KNNGraphTest handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); /** * Assert the knn graph is symmetric @@ -87,7 +87,7 @@ class KNNGraphTest assert_symmetry<<nnz, 256), 256, 0, stream>>>( out->rows(), out->cols(), out->vals(), out->nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git 
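The knn_graph hunk above shows the accumulator idiom this patch standardizes on: a device_scalar zeroed asynchronously, accumulated into by a kernel, then read back once. As a generic sketch (the reduction kernel is assumed, not shown):

#include <rmm/device_scalar.hpp>

template <typename T>
T reduce_into_scalar(cudaStream_t stream) {
  rmm::device_scalar<T> sum(stream);
  sum.set_value_to_zero_async(stream);  // replaces cudaMemsetAsync on a
                                        // one-element device_uvector
  // ... launch a kernel that accumulates atomically into sum.data() ...
  return sum.value(stream);
}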
a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index e96e5c289c..b1745c9b26 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -97,14 +98,13 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); assert_symmetry<<>>( out.rows(), out.cols(), out.vals(), out.nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } From f6fe37a9718adc91a472f47aa357cc438930c69b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 5 Jul 2021 12:04:35 +0200 Subject: [PATCH 06/17] Use rmm::exec_policy instead of thrust::cuda::par.on + remove rmm::device_vector --- cpp/include/raft/linalg/init.h | 3 +- cpp/include/raft/linalg/transpose.h | 4 +- cpp/include/raft/matrix/matrix.cuh | 14 +-- cpp/include/raft/sparse/convert/csr.cuh | 2 +- cpp/include/raft/sparse/coo.cuh | 1 - .../coo_spmv_strategies/base_strategy.cuh | 1 - cpp/include/raft/sparse/distance/utils.cuh | 2 - cpp/include/raft/sparse/linalg/add.cuh | 3 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 5 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 110 +++++++++--------- cpp/include/raft/sparse/mst/detail/utils.cuh | 4 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 24 ++-- cpp/include/raft/sparse/op/sort.h | 9 +- .../sparse/selection/connect_components.cuh | 9 +- .../knn/detail/ann_quantized_faiss.cuh | 1 - .../knn/detail/knn_brute_force_faiss.cuh | 1 - cpp/include/raft/spectral/kmeans.hpp | 1 - .../raft/spectral/modularity_maximization.hpp | 1 - cpp/include/raft/spectral/partition.hpp | 1 - cpp/include/raft/spectral/spectral_util.hpp | 1 - cpp/test/eigen_solvers.cu | 1 - cpp/test/linalg/reduce.cuh | 15 ++- cpp/test/matrix/matrix.cu | 2 +- cpp/test/mst.cu | 20 ++-- 24 files changed, 118 insertions(+), 117 deletions(-) diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..9944685a1f 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -40,7 +41,7 @@ void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); - thrust::copy(thrust::cuda::par.on(stream), first, last, ptr); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); } /** diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..db1cabd694 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include namespace raft { namespace linalg { @@ -60,7 +60,7 @@ void transpose(math_t *inout, int n, cudaStream_t stream) { auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { int s_row = idx % m; int s_col = idx / m; diff --git a/cpp/include/raft/matrix/matrix.cuh 
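The policy swap in init.h and transpose.h above is mechanical, but it is worth spelling out what changes at runtime. A before/after sketch (function and data are illustrative; the device lambda assumes nvcc's extended-lambda flag, which this codebase already uses):

#include <rmm/exec_policy.hpp>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

void scale_inplace(float* data, int n, float alpha, cudaStream_t stream) {
  auto counting = thrust::make_counting_iterator<int>(0);
  // Before: thrust::cuda::par.on(stream) ran on `stream` but allocated any
  // temporary storage with cudaMalloc. After: rmm::exec_policy(stream)
  // runs on the same stream and draws temporaries (for algorithms that
  // need them) from the current RMM device resource.
  thrust::for_each(rmm::exec_policy(stream), counting, counting + n,
                   [=] __device__(int i) { data[i] *= alpha; });
}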
b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..688b92da09 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include namespace raft { namespace matrix { @@ -64,7 +64,7 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % n_rows_indices; idx_t col = idx / n_rows_indices; @@ -108,7 +108,7 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, auto d_q_trunc = out; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % m; idx_t col = idx / m; @@ -133,8 +133,8 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = dest_row; @@ -161,8 +161,8 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = (m - dest_row) - 1; diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 16f351bf48..79b18ebd0a 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -160,7 +160,7 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 6af8eae395..fa21614f8f 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 194799aed0..3b57225350 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -22,7 +22,6 @@ #include "coo_mask_row_iterators.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..3bee1bc87d 100644 --- 
a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -21,8 +21,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 3bf028d14a..7ed627b9e2 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -181,7 +182,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 614c9d830e..a6e1027288 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -293,8 +294,8 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, + __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 029b76a945..33b980afcd 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -23,10 +23,10 @@ #include "utils.cuh" #include -#include +#include +#include #include -#include #include #include #include @@ -34,10 +34,6 @@ #include #include -#include - -#include - namespace raft { namespace mst { typedef std::chrono::high_resolution_clock Clock; @@ -63,20 +59,20 @@ MST_solver::MST_solver( offsets(offsets_), indices(indices_), weights(weights_), - altered_weights(e_), + altered_weights(e_, stream_), v(v_), e(e_), color_index(color_), - color(v_), - next_color(v_), - min_edge_color(v_), - new_mst_edge(v_), - mst_edge(e_, false), - temp_src(2 * v_), - temp_dst(2 * v_), - temp_weights(2 * v_), - mst_edge_count(1, 0), - prev_mst_edge_count(1, 0), + color(v_, stream_), + next_color(v_, stream_), + min_edge_color(v_, stream_), + new_mst_edge(v_, stream_), + mst_edge(e_, stream_), + temp_src(2 * v_, stream_), + temp_dst(2 * v_, stream_), + temp_weights(2 * v_, stream_), + mst_edge_count(1, stream_), + prev_mst_edge_count(1, stream_), stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), @@ -85,13 +81,18 @@ MST_solver::MST_solver( max_threads = handle_.get_device_properties().maxThreadsPerBlock; sm_count = handle_.get_device_properties().multiProcessorCount; + mst_edge_count.set_value_to_zero_async(stream); + prev_mst_edge_count.set_value_to_zero_async(stream); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), + stream)); + //Initially, color holds the vertex id as color auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 
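A behavioral note behind these constructor changes: rmm::device_vector value-initialized its elements, while rmm::device_uvector deliberately leaves memory uninitialized, which is why explicit memsets and set_value_to_zero_async calls appear above. A sketch of the required pattern (names are illustrative; CUDA_CHECK is the raft macro used throughout the patch):

#include <raft/cudart_utils.h>
#include <rmm/device_uvector.hpp>

rmm::device_uvector<bool> make_flags(std::size_t n, cudaStream_t stream) {
  rmm::device_uvector<bool> flags(n, stream);  // contents undefined here
  CUDA_CHECK(
    cudaMemsetAsync(flags.data(), 0, flags.size() * sizeof(bool), stream));
  return flags;
}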
0); thrust::sequence(policy, color_index, color_index + v, 0); } else { - raft::copy(color.data().get(), color_index, v, stream); + raft::copy(color.data(), color_index, v, stream); } thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } @@ -158,12 +159,12 @@ MST_solver::solve() { timer3 += duration_us(stop - start); #endif - auto curr_mst_edge_count = mst_edge_count[0]; + auto curr_mst_edge_count = mst_edge_count.value(stream); RAFT_EXPECTS(curr_mst_edge_count <= max_mst_edges, "Number of edges found by MST is invalid. This may be due to " "loss in precision. Try increasing precision of weights."); - if (curr_mst_edge_count == prev_mst_edge_count[0]) { + if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 @@ -194,12 +195,11 @@ MST_solver::solve() { #endif // copy this iteration's results and store - prev_mst_edge_count = mst_edge_count; + prev_mst_edge_count.set_value_async(curr_mst_edge_count, stream); } // result packaging - thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = mst_edge_count.value(stream); mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -226,7 +226,7 @@ template ::alteration_max() { auto policy = handle.get_thrust_policy(); - rmm::device_vector tmp(e); + rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); //sort tmp weights @@ -240,7 +240,7 @@ MST_solver::alteration_max() { thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; + auto init = tmp.element(1, stream) - tmp.element(0, stream); auto max = thrust::transform_reduce(policy, begin, end, alteration_functor(), init, thrust::minimum()); @@ -259,7 +259,7 @@ void MST_solver::alteration() { alteration_t max = alteration_max(); // pool of rand values - rmm::device_vector rand_values(v); + rmm::device_uvector rand_values(v, stream); // Random number generator curandGenerator_t randGen; @@ -267,8 +267,7 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, @@ -276,8 +275,8 @@ void MST_solver::alteration() { //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data(), + altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher @@ -286,23 +285,24 @@ template ::label_prop( vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors - thrust::host_vector curr_mst_edge_count = mst_edge_count; + edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto 
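alteration_max above also trades device_vector's host-side operator[] for device_uvector::element(i, stream), which makes the device-to-host copy and its synchronization explicit. A sketch (assuming element() synchronizes the given stream before returning, as in this RMM version):

#include <rmm/device_uvector.hpp>

// Difference of the first two elements of a sorted device array, read on
// the host via two explicit, stream-ordered single-element copies.
template <typename T>
T first_gap(const rmm::device_uvector<T>& sorted, cudaStream_t stream) {
  return sorted.element(1, stream) - sorted.element(0, stream);
}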
min_pair_nthreads = std::min(v, (vertex_t)max_threads); auto min_pair_nblocks = std::min( (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); - rmm::device_vector done(1, false); - - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); - vertex_t* next_color_ptr = next_color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + vertex_t* color_ptr = color.data(); + vertex_t* next_color_ptr = next_color.data(); - bool* done_ptr = done.data().get(); + rmm::device_scalar done(stream); + done.set_value_to_zero_async(stream); + bool* done_ptr = done.data(); + const bool true_val = true; auto i = 0; - while (!done[0]) { - done[0] = true; + while (!done.value(stream)) { + done.set_value_async(true_val, stream); detail::min_pair_colors<<>>( v, indices, new_mst_edge_ptr, color_ptr, color_index, next_color_ptr); @@ -333,11 +333,11 @@ void MST_solver>>( offsets, indices, altered_weights_ptr, color_ptr, color_index, @@ -356,14 +356,14 @@ void MST_solver::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); - alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); + alteration_t* altered_weights_ptr = altered_weights.data(); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); detail::min_edge_per_supervertex<<>>( color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, @@ -388,8 +388,8 @@ void MST_solver::check_termination() { std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges - edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + edge_t* mst_edge_count_ptr = mst_edge_count.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -411,7 +411,7 @@ void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { auto policy = handle.get_thrust_policy(); - auto curr_mst_edge_count = prev_mst_edge_count[0]; + edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..4d5ca6ebe1 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #define MST_TIME namespace raft { @@ -32,7 +32,7 @@ __device__ idx_t get_1D_idx() { // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh 
b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..44b34ee5c7 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -18,8 +18,8 @@ #pragma once #include +#include #include -#include namespace raft { @@ -68,24 +68,24 @@ class MST_solver { vertex_t sm_count; vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector + rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_vector + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_vector + rmm::device_scalar prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration - rmm::device_vector temp_src; - rmm::device_vector temp_dst; - rmm::device_vector temp_weights; + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; void label_prop(vertex_t* mst_src, vertex_t* mst_dst); void min_edge_per_vertex(); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index d53ceb62a9..9414a11ade 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -69,8 +69,8 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. 
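These member-type changes ripple into every constructor: device_uvector and device_scalar have no default or size-only constructors, so each member must receive the stream in the constructor's initializer list, as the mst_solver_inl.cuh hunk earlier does. A condensed sketch with hypothetical names:

#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

template <typename vertex_t>
class solver_state {
 public:
  solver_state(std::size_t v, cudaStream_t stream)
    : color_(v, stream), edge_count_(stream) {}  // stream required up front

 private:
  rmm::device_uvector<vertex_t> color_;
  rmm::device_scalar<vertex_t> edge_count_;
};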
- thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, + vals, TupleComp()); } /** @@ -104,8 +104,7 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(rmm::exec_policy(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 390522c9bc..46369ca964 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -211,7 +212,7 @@ void perform_1nn(cub::KeyValuePair *kvp, workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } @@ -232,15 +233,15 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors, cub::KeyValuePair *kvp, value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, + src_indices); auto keys = thrust::make_zip_iterator(thrust::make_tuple( colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. 
- thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index c0345a01e6..43bdf12a38 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -39,7 +39,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 6db8fb7a8e..84c130b0e4 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -29,7 +29,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 5928c727c6..b6f0105487 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 1fe7819a7e..fededbfcb4 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index a994895886..2df3812a4a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index de9ff1917f..c148350c0f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 8025d8dcd6..ef67d95348 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -38,7 +38,6 @@ TEST(Raft, EigenSolvers) { index_type nnz = 0; index_type nrows = 0; auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 18261287cf..86cb4f32b4 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -16,9 +16,9 @@ #include #include -#include #include #include +#include namespace raft { namespace linalg { @@ -51,17 +51,20 @@ void unaryAndGemv(Type *dots, const Type *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, [] __device__(Type v) { return v * v; }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + rmm::device_uvector ones(N, stream); //column vector [1...1] + raft::linalg::unaryOp( + ones.data(), ones.data(), ones.size(), + [=] 
__device__(Type input) { return 1; }, stream); Type alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, + sq.data(), D, ones.data(), 1, &beta, + dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index e7da92a136..2d2d9d2057 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -112,7 +112,7 @@ class MatrixCopyRowsTest : public ::testing::Test { // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..4714fd5eaa 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -127,11 +127,18 @@ class MSTTest v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector color(v, 0); + rmm::device_uvector mst_src(2 * v - 2, handle.get_stream()); + rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); + rmm::device_uvector color(v, handle.get_stream()); + + CUDA_CHECK( + cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + handle.get_stream())); vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); @@ -214,7 +221,6 @@ class MSTTest protected: MSTTestInput mst_input; CSRDevice csr_d; - rmm::device_vector mst_edge; vertex_t v; edge_t e; int iterations; From 1b5dbfdebe81b508ad52ecdfbebb0ff7afd4648e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 7 Jul 2021 14:12:20 +0200 Subject: [PATCH 07/17] Update raft::allocate to use RMM --- cpp/include/raft/cudart_utils.h | 17 ++++++++--- cpp/test/distance/dist_adj.cu | 10 +++---- cpp/test/distance/distance_base.cuh | 12 ++++---- cpp/test/distance/fused_l2_nn.cu | 18 ++++++----- cpp/test/label/label.cu | 10 +++---- cpp/test/linalg/add.cu | 8 ++--- cpp/test/linalg/binary_op.cu | 8 ++--- cpp/test/linalg/coalesced_reduction.cu | 6 ++-- cpp/test/linalg/divide.cu | 6 ++-- cpp/test/linalg/eig.cu | 24 +++++++-------- cpp/test/linalg/eig_sel.cu | 10 +++---- cpp/test/linalg/eltwise.cu | 14 ++++----- cpp/test/linalg/map.cu | 12 ++++---- cpp/test/linalg/map_then_reduce.cu | 6 ++-- cpp/test/linalg/matrix_vector_op.cu | 10 +++---- cpp/test/linalg/multiply.cu | 6 ++-- cpp/test/linalg/norm.cu | 12 ++++---- cpp/test/linalg/reduce.cu | 6 ++-- cpp/test/linalg/strided_reduction.cu | 6 ++-- cpp/test/linalg/subtract.cu | 8 ++--- cpp/test/linalg/svd.cu | 14 ++++----- cpp/test/linalg/transpose.cu | 6 ++-- cpp/test/linalg/unary_op.cu | 6 ++-- cpp/test/matrix/math.cu | 30 +++++++++---------- cpp/test/matrix/matrix.cu | 8 ++--- cpp/test/random/rng.cu | 20 ++++++------- 
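The cudart_utils.h hunk that follows reroutes raft::allocate through rmm::mr::get_current_device_resource() and adds a mandatory stream argument; the new raft::deallocate releases through the same resource. A usage sketch under the new signatures (buffer name and size are illustrative):

#include <raft/cudart_utils.h>

void workspace_roundtrip(cudaStream_t stream) {
  float* ws;
  raft::allocate(ws, 256, stream, /*setZero=*/true);
  // ... use ws in kernels ordered on `stream` ...
  raft::deallocate(ws, 256, stream);  // same length and stream to release
}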
cpp/test/random/rng_int.cu | 4 +-- cpp/test/random/sample_without_replacement.cu | 8 ++--- cpp/test/sparse/add.cu | 24 +++++++-------- cpp/test/sparse/convert_coo.cu | 6 ++-- cpp/test/sparse/convert_csr.cu | 14 ++++----- cpp/test/sparse/csr_row_slice.cu | 18 +++++------ cpp/test/sparse/csr_to_dense.cu | 10 +++---- cpp/test/sparse/csr_transpose.cu | 18 +++++------ cpp/test/sparse/degree.cu | 17 ++++++----- cpp/test/sparse/dist_coo_spmv.cu | 11 +++---- cpp/test/sparse/distance.cu | 11 +++---- cpp/test/sparse/knn.cu | 15 +++++----- cpp/test/sparse/knn_graph.cu | 2 +- cpp/test/sparse/linkage.cu | 4 +-- cpp/test/sparse/norm.cu | 8 ++--- cpp/test/sparse/row_op.cu | 6 ++-- cpp/test/sparse/selection.cu | 12 ++++---- cpp/test/sparse/sort.cu | 8 ++--- cpp/test/sparse/symmetrize.cu | 6 ++-- cpp/test/spatial/haversine.cu | 16 +++++----- cpp/test/spatial/knn.cu | 28 ++++++++--------- cpp/test/stats/mean.cu | 4 +-- cpp/test/stats/mean_center.cu | 8 ++--- cpp/test/stats/stddev.cu | 8 ++--- cpp/test/stats/sum.cu | 4 +-- 51 files changed, 290 insertions(+), 273 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..8b8b3cbeca 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -259,11 +260,19 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -/** cuda malloc */ template -void allocate(Type*& ptr, size_t len, bool setZero = false) { - CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); - if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +void allocate(Type*& ptr, size_t len, cudaStream_t stream, + bool setZero = false) { + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate( + len * sizeof(Type), stream); + if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, len * sizeof(Type))); +} + +template +void deallocate(Type*& ptr, size_t len, cudaStream_t stream, + bool setZero = false) { + rmm::mr::get_current_device_resource()->deallocate(ptr, len * sizeof(Type), + stream); } /** helper method to get max usable shared mem per block parameter */ diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..dc397a5eb2 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -79,10 +79,10 @@ class DistanceAdjTest bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); @@ -94,7 +94,7 @@ class DistanceAdjTest raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index d6f06c186a..ac12076e3a 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -169,11 +169,11 @@ class DistanceTest : public ::testing::TestWithParam> { bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - 
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index e2ed2c01dc..dc397a5eb2 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -79,10 +79,10 @@ class DistanceAdjTest
   bool isRowMajor = params.isRowMajor;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(dist_ref, m * n);
-  raft::allocate(dist, m * n);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(dist_ref, m * n, stream);
+  raft::allocate(dist, m * n, stream);
   r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream);
   r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
@@ -94,7 +94,7 @@ class DistanceAdjTest
     raft::distance::getWorkspaceSize(x, y, m, n, k);
   if (worksize != 0) {
-    raft::allocate(workspace, worksize);
+    raft::allocate(workspace, worksize, stream);
   }
   auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) {
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index d6f06c186a..ac12076e3a 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -169,11 +169,11 @@ class DistanceTest : public ::testing::TestWithParam> {
   bool isRowMajor = params.isRowMajor;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(dist_ref, m * n);
-  raft::allocate(dist, m * n);
-  raft::allocate(dist2, m * n);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(dist_ref, m * n, stream);
+  raft::allocate(dist, m * n, stream);
+  raft::allocate(dist2, m * n, stream);
   r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream);
   r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
   naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor);
@@ -182,7 +182,7 @@ class DistanceTest : public ::testing::TestWithParam> {
     raft::distance::getWorkspaceSize(x, y, m, n, k);
   if (worksize != 0) {
-    raft::allocate(workspace, worksize);
+    raft::allocate(workspace, worksize, stream);
   }
   DataType threshold = -10000.f;
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 4573a070b6..8f12b26dc0 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -107,13 +107,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> {
   int n = params.n;
   int k = params.k;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(xn, m);
-  raft::allocate(yn, n);
-  raft::allocate(workspace, sizeof(int) * m);
-  raft::allocate(min, m);
-  raft::allocate(min_ref, m);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(xn, m, stream);
+  raft::allocate(yn, n, stream);
+  raft::allocate(workspace, sizeof(int) * m, stream);
+  raft::allocate(min, m, stream);
+  raft::allocate(min_ref, m, stream);
   r.uniform(x, m * k, DataT(-1.0), DataT(1.0), stream);
   r.uniform(y, n * k, DataT(-1.0), DataT(1.0), stream);
   generateGoldenResult();
@@ -282,7 +282,8 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   void SetUp() override {
     FusedL2NNTest::SetUp();
     int m = this->params.m;
-    raft::allocate(min1, m);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(min1, m, stream);
   }
   void TearDown() override {
@@ -294,6 +295,7 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   cub::KeyValuePair *min1;
   static const int NumRepeats = 100;
+  cudaStream_t stream;
   void generateGoldenResult() override {}
 };
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 2a159994dc..86a8927283 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -43,9 +43,9 @@ TEST_F(MakeMonotonicTest, Result) {
   float *data, *actual, *expected;
-  raft::allocate(data, m, true);
-  raft::allocate(actual, m, true);
-  raft::allocate(expected, m, true);
+  raft::allocate(data, m, stream, true);
+  raft::allocate(actual, m, stream, true);
+  raft::allocate(expected, m, stream, true);
   float *data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0,  3.0,
                                8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
@@ -76,7 +76,7 @@ TEST(labelTest, Classlabels) {
   int n_rows = 6;
   float *y_d;
-  raft::allocate(y_d, n_rows);
+  raft::allocate(y_d, n_rows, stream);
   float y_h[] = {2, -1, 1, 2, 1, 1};
   raft::update_device(y_d, y_h, n_rows, stream);
@@ -91,7 +91,7 @@ TEST(labelTest, Classlabels) {
                             raft::Compare(), stream));
   float *y_relabeled_d;
-  raft::allocate(y_relabeled_d, n_rows);
+  raft::allocate(y_relabeled_d, n_rows, stream);
   getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2,
                stream);
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index 2fc9d4e30f..ab7de0f24b 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -32,10 +32,10 @@ class AddTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InT(-1.0), InT(1.0), stream);
   r.uniform(in2, len, InT(-1.0), InT(1.0), stream);
   naiveAddElem(out_ref, in1, in2, len);
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index 979b12f237..475d8e58ff 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -48,10 +48,10 @@ class BinaryOpTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   naiveAdd(out_ref, in1, in2, len);
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index e45f5651b4..bfbaf9b5f9 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -59,9 +59,9 @@ class coalescedReductionTest
   int len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, rows);
-  raft::allocate(dots_act, rows);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, rows, stream);
+  raft::allocate(dots_act, rows, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveCoalescedReduction(dots_exp, data, cols, rows, stream);
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 2396558939..07fbda7e1c 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -54,9 +54,9 @@ class DivideTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveDivide(out_ref, in, params.scalar, len, stream);
   divideScalar(out, in, params.scalar, len, stream);
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index 159d288174..d53713f004 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -50,24 +50,24 @@ class EigTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
-  raft::allocate(cov_matrix, len);
+  raft::allocate(cov_matrix, len, stream);
   T cov_matrix_h[] = {1.0,  0.9,  0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
                       0.81, 0.9,  1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
   ASSERT(len == 16, "This test only works with 4x4 matrices!");
   raft::update_device(cov_matrix, cov_matrix_h, len, stream);
-  raft::allocate(eig_vectors, len);
-  raft::allocate(eig_vals, params.n_col);
-  raft::allocate(eig_vectors_jacobi, len);
-  raft::allocate(eig_vals_jacobi, params.n_col);
+  raft::allocate(eig_vectors, len, stream);
+  raft::allocate(eig_vals, params.n_col, stream);
+  raft::allocate(eig_vectors_jacobi, len, stream);
+  raft::allocate(eig_vals_jacobi, params.n_col, stream);
   T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874,
                            0.4874, -0.5123, 0.6498, 0.2789,  -0.2789, -0.6498,
                            0.4874, 0.5123,  0.5123, 0.4874};
   T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266};
-  raft::allocate(eig_vectors_ref, len);
-  raft::allocate(eig_vals_ref, params.n_col);
+  raft::allocate(eig_vectors_ref, len, stream);
+  raft::allocate(eig_vals_ref, params.n_col, stream);
   raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream);
   raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream);
@@ -82,11 +82,11 @@ class EigTest : public ::testing::TestWithParam> {
   // test code for comparing two methods
   len = params.n * params.n;
-  raft::allocate(cov_matrix_large, len);
-  raft::allocate(eig_vectors_large, len);
-  raft::allocate(eig_vectors_jacobi_large, len);
-  raft::allocate(eig_vals_large, params.n);
-  raft::allocate(eig_vals_jacobi_large, params.n);
+  raft::allocate(cov_matrix_large, len, stream);
+  raft::allocate(eig_vectors_large, len, stream);
+  raft::allocate(eig_vectors_jacobi_large, len, stream);
+  raft::allocate(eig_vals_large, params.n, stream);
+  raft::allocate(eig_vals_jacobi_large, params.n, stream);
   r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream);
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index b3980f281d..26c8177269 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -51,21 +51,21 @@ class EigSelTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   int len = params.len;
-  raft::allocate(cov_matrix, len);
+  raft::allocate(cov_matrix, len, stream);
   T cov_matrix_h[] = {1.0,  0.9,  0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
                       0.81, 0.9,  1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
   ASSERT(len == 16, "This test only works with 4x4 matrices!");
   raft::update_device(cov_matrix, cov_matrix_h, len, stream);
-  raft::allocate(eig_vectors, 12);
-  raft::allocate(eig_vals, params.n_col);
+  raft::allocate(eig_vectors, 12, stream);
+  raft::allocate(eig_vals, params.n_col, stream);
   T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874,  -0.5123, 0.6498, 0.2789,
                            -0.2789, -0.6498, 0.4874, 0.5123,  0.5123, 0.4874};
   T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266};
-  raft::allocate(eig_vectors_ref, 12);
-  raft::allocate(eig_vals_ref, params.n_col);
+  raft::allocate(eig_vectors_ref, 12, stream);
+  raft::allocate(eig_vals_ref, params.n_col, stream);
   raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream);
   raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream);
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index 572951c557..e955f7a354 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -69,9 +69,9 @@ class ScalarMultiplyTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, scalar, len, stream);
   scalarMultiply(out, in, scalar, len, stream);
@@ -156,10 +156,10 @@ class EltwiseAddTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   int len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   r.uniform(in2, len, T(-1.0), T(1.0), stream);
   naiveAdd(out_ref, in1, in2, len, stream);
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 089edd738e..6087893006 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -47,7 +47,7 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
                 const InType *in3, InType scalar, IdxType len,
                 cudaStream_t stream) {
   InType *tmp;
-  allocate(tmp, len);
+  raft::allocate(tmp, len, stream);
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
   scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream);
@@ -66,11 +66,11 @@ class MapTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(in3, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(in3, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in3, len, InType(-1.0), InType(1.0), stream);
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index c78dd9e8fb..49b7deaf1f 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -77,9 +77,9 @@ class MapReduceTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   mapReduceLaunch(out_ref, out, in, len, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index aa46c78b0f..933f806ff5 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -68,12 +68,12 @@ class MatVecOpTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   IdxType vecLen = params.bcastAlongRows ? D : N;
-  allocate(vec1, vecLen);
-  allocate(vec2, vecLen);
+  raft::allocate(vec1, vecLen, stream);
+  raft::allocate(vec2, vecLen, stream);
   r.uniform(in, len, (T)-1.0, (T)1.0, stream);
   r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream);
   r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream);
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 1d3e753de3..33f2b1c104 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -34,9 +34,9 @@ class MultiplyTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, params.scalar, len, stream);
   multiplyScalar(out, in, params.scalar, len, stream);
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index acc25addd0..29ea0a1915 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -78,9 +78,9 @@ class RowNormTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols, len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, rows);
-  raft::allocate(dots_act, rows);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, rows, stream);
+  raft::allocate(dots_act, rows, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
                stream);
@@ -143,10 +143,10 @@ class ColNormTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols, len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
-  raft::allocate(dots_exp, cols);
-  raft::allocate(dots_act, cols);
+  raft::allocate(dots_exp, cols, stream);
+  raft::allocate(dots_act, cols, stream);
   naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
                stream);
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 255cf1a696..dba8f76a22 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -58,9 +58,9 @@ class ReduceTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
   outlen = params.alongRows ? rows : cols;
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, outlen);
-  raft::allocate(dots_act, outlen);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, outlen, stream);
+  raft::allocate(dots_act, outlen, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
                  params.alongRows, stream);
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index b27fa2ac1a..3fe076f1d4 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -49,9 +49,9 @@ class stridedReductionTest
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, cols);  //expected dot products (from test)
-  raft::allocate(dots_act, cols);  //actual dot products (from prim)
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, cols, stream);  //expected dot products (from test)
+  raft::allocate(dots_act, cols, stream);  //actual dot products (from prim)
   r.uniform(data, len, T(-1.0), T(1.0), stream);  //initialize matrix to random
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index ced3f65fdd..7318410330 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -81,10 +81,10 @@ class SubtractTest : public ::testing::TestWithParam> {
   int len = params.len;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   r.uniform(in2, len, T(-1.0), T(1.0), stream);
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index fff321768f..adfbfd01d6 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -49,7 +49,7 @@ class SvdTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
   cudaStream_t stream = handle.get_stream();
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   ASSERT(params.n_row == 3, "This test only supports nrows=3!");
   ASSERT(params.len == 6, "This test only supports len=6!");
@@ -59,9 +59,9 @@ class SvdTest : public ::testing::TestWithParam> {
   int left_evl = params.n_row * params.n_col;
   int right_evl = params.n_col * params.n_col;
-  raft::allocate(left_eig_vectors_qr, left_evl);
-  raft::allocate(right_eig_vectors_trans_qr, right_evl);
-  raft::allocate(sing_vals_qr, params.n_col);
+  raft::allocate(left_eig_vectors_qr, left_evl, stream);
+  raft::allocate(right_eig_vectors_trans_qr, right_evl, stream);
+  raft::allocate(sing_vals_qr, params.n_col, stream);
   // allocate(left_eig_vectors_jacobi, left_evl);
   // allocate(right_eig_vectors_trans_jacobi, right_evl);
@@ -74,9 +74,9 @@ class SvdTest : public ::testing::TestWithParam> {
   T sing_vals_ref_h[] = {7.065283, 1.040081};
-  raft::allocate(left_eig_vectors_ref, left_evl);
-  raft::allocate(right_eig_vectors_ref, right_evl);
-  raft::allocate(sing_vals_ref, params.n_col);
+  raft::allocate(left_eig_vectors_ref, left_evl, stream);
+  raft::allocate(right_eig_vectors_ref, right_evl, stream);
+  raft::allocate(sing_vals_ref, params.n_col, stream);
   raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl,
                       stream);
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index f10b029962..08be179c59 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -48,16 +48,16 @@ class TransposeTest : public ::testing::TestWithParam> {
   int len = params.len;
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   ASSERT(params.len == 9, "This test works only with len=9!");
   T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
   raft::update_device(data, data_h, len, stream);
-  raft::allocate(data_trans_ref, len);
+  raft::allocate(data_trans_ref, len, stream);
   T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0};
   raft::update_device(data_trans_ref, data_ref_h, len, stream);
-  raft::allocate(data_trans, len);
+  raft::allocate(data_trans, len, stream);
   transpose(handle, data, data_trans, params.n_row, params.n_col, stream);
   transpose(data, params.n_row, stream);
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 666ab8619d..8ef810d794 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -53,9 +53,9 @@ class UnaryOpTest
   raft::random::Rng r(params.seed);
   CUDA_CHECK(cudaStreamCreate(&stream));
   auto len = params.len;
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
 }
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 578139623a..227aca643e 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -115,22 +115,22 @@ class MathTest : public ::testing::TestWithParam> {
   random::Rng r(params.seed);
   int len = params.len;
-  allocate(in_power, len);
-  allocate(out_power_ref, len);
-  allocate(in_sqrt, len);
-  allocate(out_sqrt_ref, len);
-  allocate(in_sign_flip, len);
-  allocate(out_sign_flip_ref, len);
-
   raft::handle_t handle;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in_ratio, 4);
+  raft::allocate(in_power, len, stream);
+  raft::allocate(out_power_ref, len, stream);
+  raft::allocate(in_sqrt, len, stream);
+  raft::allocate(out_sqrt_ref, len, stream);
+  raft::allocate(in_sign_flip, len, stream);
+  raft::allocate(out_sign_flip_ref, len, stream);
+
+  raft::allocate(in_ratio, 4, stream);
   T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0};
   update_device(in_ratio, in_ratio_h, 4, stream);
-  allocate(out_ratio_ref, 4);
+  raft::allocate(out_ratio_ref, 4, stream);
   T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375};
   update_device(out_ratio_ref, out_ratio_ref_h, 4, stream);
@@ -150,9 +150,9 @@ class MathTest : public ::testing::TestWithParam> {
   naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col);
   signFlip(in_sign_flip, params.n_row, params.n_col, stream);
-  allocate(in_recip, 4);
-  allocate(in_recip_ref, 4);
-  allocate(out_recip, 4);
+  raft::allocate(in_recip, 4, stream);
+  raft::allocate(in_recip_ref, 4, stream);
+  raft::allocate(out_recip, 4, stream);
   // default threshold is 1e-15
   std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16};
   std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0};
@@ -167,9 +167,9 @@ class MathTest : public ::testing::TestWithParam> {
   std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1};
   std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1};
-  allocate(in_smallzero, 4);
-  allocate(out_smallzero, 4);
-  allocate(out_smallzero_ref, 4);
+  raft::allocate(in_smallzero, 4, stream);
+  raft::allocate(out_smallzero, 4, stream);
+  raft::allocate(out_smallzero_ref, 4, stream);
   update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream);
   update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream);
   setSmallValuesZero(out_smallzero, in_smallzero, 4, stream);
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 2d2d9d2057..047415957c 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -46,9 +46,9 @@ class MatrixTest : public ::testing::TestWithParam> {
   int len = params.n_row * params.n_col;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(in1_revr, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(in1_revr, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   copy(in1, in2, params.n_row, params.n_col, stream);
@@ -56,7 +56,7 @@ class MatrixTest : public ::testing::TestWithParam> {
   // colReverse(in1_revr, params.n_row, params.n_col);
   T *outTrunc;
-  raft::allocate(outTrunc, 6);
+  raft::allocate(outTrunc, 6, stream);
   truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index af10dcab30..ee5312de22 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -88,8 +88,8 @@ class RngTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(data, params.len);
-  allocate(stats, 2, true);
+  raft::allocate(data, params.len, stream);
+  raft::allocate(stats, 2, stream, true);
   switch (params.type) {
     case RNG_Normal:
       r.normal(data, params.len, params.start, params.end, stream);
@@ -383,9 +383,9 @@ TEST(Rng, MeanError) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_result, num_experiments);
-  allocate(std_result, num_experiments);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_result, num_experiments, stream);
+  raft::allocate(std_result, num_experiments, stream);
   for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) {
     Rng r(seed, rtype);
@@ -432,7 +432,7 @@ class ScaledBernoulliTest : public ::testing::Test {
   Rng r(42);
-  allocate(data, len * sizeof(T), stream);
+  raft::allocate(data, len * sizeof(T), stream);
   r.scaled_bernoulli(data, len, T(0.5), T(scale), stream);
 }
@@ -463,7 +463,7 @@ class BernoulliTest : public ::testing::Test {
 void SetUp() override {
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(42);
-  allocate(data, len * sizeof(bool), stream);
+  raft::allocate(data, len * sizeof(bool), stream);
   r.bernoulli(data, len, T(0.5), stream);
 }
@@ -518,9 +518,9 @@ class RngNormalTableTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(data, len);
-  allocate(stats, 2, true);
-  allocate(mu_vec, params.cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(stats, 2, stream, true);
+  raft::allocate(mu_vec, params.cols, stream);
   r.fill(mu_vec, params.cols, params.mu, stream);
   T* sigma_vec = nullptr;
   r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec,
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index 92f12206e8..cd948d50aa 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -72,8 +72,8 @@ class RngTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, params.len);
-  allocate(stats, 2, true);
+  raft::allocate(data, params.len, stream);
+  raft::allocate(stats, 2, stream, true);
   switch (params.type) {
     case RNG_Uniform:
      r.uniformInt(data, params.len, params.start, params.end, stream);
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index d7e52a8958..7883c8fe1b 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -50,10 +50,10 @@ class SWoRTest : public ::testing::TestWithParam> {
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(in, params.len);
-  allocate(wts, params.len);
-  allocate(out, params.sampledLen);
-  allocate(outIdx, params.sampledLen);
+  raft::allocate(in, params.len, stream);
+  raft::allocate(wts, params.len, stream);
+  raft::allocate(out, params.sampledLen, stream);
+  raft::allocate(outIdx, params.sampledLen, stream);
   h_outIdx.resize(params.sampledLen);
   r.uniform(in, params.len, T(-1.0), T(1.0), stream);
   r.uniform(wts, params.len, T(1.0), T(2.0), stream);
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index 38c3e5bfbb..5b2e5bd5b4 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -56,21 +56,21 @@ class CSRAddTest
   cudaStreamCreate(&stream);
-  raft::allocate(ind_a, n_rows);
-  raft::allocate(ind_ptr_a, nnz_a);
-  raft::allocate(values_a, nnz_a);
+  raft::allocate(ind_a, n_rows, stream);
+  raft::allocate(ind_ptr_a, nnz_a, stream);
+  raft::allocate(values_a, nnz_a, stream);
 
-  raft::allocate(ind_b, n_rows);
-  raft::allocate(ind_ptr_b, nnz_b);
-  raft::allocate(values_b, nnz_b);
+  raft::allocate(ind_b, n_rows, stream);
+  raft::allocate(ind_ptr_b, nnz_b, stream);
+  raft::allocate(values_b, nnz_b, stream);
 
-  raft::allocate(ind_verify, n_rows);
-  raft::allocate(ind_ptr_verify, nnz_result);
-  raft::allocate(values_verify, nnz_result);
+  raft::allocate(ind_verify, n_rows, stream);
+  raft::allocate(ind_ptr_verify, nnz_result, stream);
+  raft::allocate(values_verify, nnz_result, stream);
 
-  raft::allocate(ind_result, n_rows);
-  raft::allocate(ind_ptr_result, nnz_result);
-  raft::allocate(values_result, nnz_result);
+  raft::allocate(ind_result, n_rows, stream);
+  raft::allocate(ind_ptr_result, nnz_result, stream);
+  raft::allocate(values_result, nnz_result, stream);
 }
 void Run() {
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index ea69ecfc53..8af6d30d7d 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -43,9 +43,9 @@ class CSRtoCOOTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   cudaStreamCreate(&stream);
-  raft::allocate(ex_scan, params.ex_scan.size());
-  raft::allocate(verify, params.verify.size());
-  raft::allocate(result, params.verify.size(), true);
+  raft::allocate(ex_scan, params.ex_scan.size(), stream);
+  raft::allocate(verify, params.verify.size(), stream);
+  raft::allocate(result, params.verify.size(), stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 5bee5a000e..871d8475e8 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -68,9 +68,9 @@ TEST_P(SortedCOOToCSR, Result) {
   int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
   int *exp_h = new int[4]{0, 2, 4, 6};
-  raft::allocate(in, nnz, true);
-  raft::allocate(exp, 4, true);
-  raft::allocate(out, 4, true);
+  raft::allocate(in, nnz, stream, true);
+  raft::allocate(exp, 4, stream, true);
+  raft::allocate(out, 4, stream, true);
   raft::update_device(in, in_h, nnz, stream);
   raft::update_device(exp, exp_h, 4, stream);
@@ -112,10 +112,10 @@ class CSRAdjGraphTest
   cudaStreamCreate(&stream);
   nnz = params.verify.size();
-  raft::allocate(row_ind, params.n_rows);
-  raft::allocate(adj, params.n_rows * params.n_cols);
-  raft::allocate(result, nnz, true);
-  raft::allocate(verify, nnz);
+  raft::allocate(row_ind, params.n_rows, stream);
+  raft::allocate(adj, params.n_rows * params.n_cols, stream);
+  raft::allocate(result, nnz, stream, true);
+  raft::allocate(verify, nnz, stream);
 }
 void Run() {
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index b158ffdedd..1cf88e7a77 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -59,9 +59,9 @@ class CSRRowSliceTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -71,9 +71,9 @@ class CSRRowSliceTest
   std::vector out_indices_ref_h = params.out_indices_ref_h;
   std::vector out_data_ref_h = params.out_data_ref_h;
-  allocate(out_indptr_ref, out_indptr_ref_h.size());
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_data_ref, out_data_ref_h.size());
+  raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data_ref, out_data_ref_h.size(), stream);
   update_device(out_indptr_ref, out_indptr_ref_h.data(),
                 out_indptr_ref_h.size(), stream);
@@ -82,9 +82,9 @@ class CSRRowSliceTest
   update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
                 stream);
-  allocate(out_indptr, out_indptr_ref_h.size());
-  allocate(out_indices, out_indices_ref_h.size());
-  allocate(out_data, out_data_ref_h.size());
+  raft::allocate(out_indptr, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data, out_data_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index d04799befa..4a7c9b7b10 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -55,9 +55,9 @@ class CSRToDenseTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -65,11 +65,11 @@ class CSRToDenseTest
   std::vector out_ref_h = params.out_ref_h;
-  allocate(out_ref, out_ref_h.size());
+  raft::allocate(out_ref, out_ref_h.size(), stream);
   update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream);
-  allocate(out, out_ref_h.size());
+  raft::allocate(out, out_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index 5baeadd16f..352a656bf7 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -61,9 +61,9 @@ class CSRTransposeTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -73,9 +73,9 @@ class CSRTransposeTest
   std::vector out_indices_ref_h = params.out_indices_ref_h;
   std::vector out_data_ref_h = params.out_data_ref_h;
-  allocate(out_indptr_ref, out_indptr_ref_h.size());
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_data_ref, out_data_ref_h.size());
+  raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data_ref, out_data_ref_h.size(), stream);
   update_device(out_indptr_ref, out_indptr_ref_h.data(),
                 out_indptr_ref_h.size(), stream);
@@ -84,9 +84,9 @@ class CSRTransposeTest
   update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
                 stream);
-  allocate(out_indptr, out_indptr_ref_h.size());
-  allocate(out_indices, out_indices_ref_h.size());
-  allocate(out_data, out_data_ref_h.size());
+  raft::allocate(out_indptr, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data, out_data_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index 5d687ad92b..68dc1b51ac 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -48,14 +48,16 @@ const std::vector> inputsf = {{5, 10, 5, 1234ULL}};
 typedef SparseDegreeTests COODegree;
 TEST_P(COODegree, Result) {
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
   int *in_rows, *verify, *results;
   int in_rows_h[5] = {0, 0, 1, 2, 2};
   int verify_h[5] = {2, 1, 2, 0, 0};
-  raft::allocate(in_rows, 5);
-  raft::allocate(verify, 5, true);
-  raft::allocate(results, 5, true);
+  raft::allocate(in_rows, 5, stream);
+  raft::allocate(verify, 5, stream, true);
+  raft::allocate(results, 5, stream, true);
   raft::update_device(in_rows, *&in_rows_h, 5, 0);
   raft::update_device(verify, *&verify_h, 5, 0);
@@ -67,6 +69,7 @@ TEST_P(COODegree, Result) {
   CUDA_CHECK(cudaFree(in_rows));
   CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 typedef SparseDegreeTests COODegreeNonzero;
@@ -81,10 +84,10 @@ TEST_P(COODegreeNonzero, Result) {
   float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
   int verify_h[5] = {1, 0, 2, 0, 0};
-  raft::allocate(in_rows, 5);
-  raft::allocate(verify, 5, true);
-  raft::allocate(results, 5, true);
-  raft::allocate(in_vals, 5, true);
+  raft::allocate(in_rows, 5, stream);
+  raft::allocate(verify, 5, stream, true);
+  raft::allocate(results, 5, stream, true);
+  raft::allocate(in_vals, 5, stream, true);
   raft::update_device(in_rows, *&in_rows_h, 5, 0);
   raft::update_device(verify, *&verify_h, 5, 0);
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 4f40e16eba..6b18e773cc 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -160,9 +160,9 @@ class SparseDistanceCOOSPMVTest
   std::vector indices_h = params.input_configuration.indices_h;
   std::vector data_h = params.input_configuration.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -173,7 +173,8 @@ class SparseDistanceCOOSPMVTest
   std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h;
-  allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+  raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1),
+                 handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 handle.get_stream());
@@ -200,7 +201,7 @@ class SparseDistanceCOOSPMVTest
   int out_size = dist_config.a_nrows * dist_config.b_nrows;
-  allocate(out_dists, out_size);
+  raft::allocate(out_dists, out_size, handle.get_stream());
   run_spmv();
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 8874e5ddb8..4ae95ae232 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -81,7 +81,7 @@ class SparseDistanceTest
   int out_size = dist_config.a_nrows * dist_config.b_nrows;
-  allocate(out_dists, out_size);
+  raft::allocate(out_dists, out_size, handle.get_stream());
   pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg);
@@ -109,9 +109,9 @@ class SparseDistanceTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -121,7 +121,8 @@ class SparseDistanceTest
   std::vector out_dists_ref_h = params.out_dists_ref_h;
-  allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+  raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1),
+                 handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 dist_config.handle.get_stream());
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index dbb9ac0a3d..8b5c9edebe 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -103,9 +103,9 @@ class SparseKNNTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -116,16 +116,17 @@ class SparseKNNTest
   std::vector out_dists_ref_h = params.out_dists_ref_h;
   std::vector out_indices_ref_h = params.out_indices_ref_h;
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_dists_ref, out_dists_ref_h.size());
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(),
+                 handle.get_stream());
+  raft::allocate(out_dists_ref, out_dists_ref_h.size(), handle.get_stream());
   update_device(out_indices_ref, out_indices_ref_h.data(),
                 out_indices_ref_h.size(), handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 handle.get_stream());
-  allocate(out_dists, n_rows * k);
-  allocate(out_indices, n_rows * k);
+  raft::allocate(out_dists, n_rows * k, handle.get_stream());
+  raft::allocate(out_indices, n_rows * k, handle.get_stream());
 }
 raft::handle_t handle;
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu
index d4e9a915e5..a89529ddb3 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/knn_graph.cu
@@ -68,7 +68,7 @@ class KNNGraphTest
   out = new raft::sparse::COO(stream);
-  allocate(X, params.X.size());
+  raft::allocate(X, params.X.size(), stream);
   update_device(X, params.X.data(), params.X.size(), stream);
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 291880bf53..4d1a5a9ad6 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -162,8 +162,8 @@ class LinkageTest : public ::testing::TestWithParam> {
              handle.get_stream());
   // Allocate result labels and expected labels on device
-  raft::allocate(labels, params.n_row);
-  raft::allocate(labels_ref, params.n_row);
+  raft::allocate(labels, params.n_row, handle.get_stream());
+  raft::allocate(labels_ref, params.n_row, handle.get_stream());
   raft::copy(data.data(), params.data.data(), data.size(),
              handle.get_stream());
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 7adbbf8b9a..f9a30d5147 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -47,10 +47,10 @@ class CSRRowNormalizeTest
     CSRRowNormalizeInputs>::GetParam();
   cudaStreamCreate(&stream);
-  raft::allocate(in_vals, params.in_vals.size());
-  raft::allocate(verify, params.verify.size());
-  raft::allocate(ex_scan, params.ex_scan.size());
-  raft::allocate(result, params.verify.size(), true);
+  raft::allocate(in_vals, params.in_vals.size(), stream);
+  raft::allocate(verify, params.verify.size(), stream);
+  raft::allocate(ex_scan, params.ex_scan.size(), stream);
+  raft::allocate(result, params.verify.size(), stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index b64fa25883..e584598f85 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -59,9 +59,9 @@ class CSRRowOpTest
   n_rows = params.ex_scan.size();
   nnz = params.verify.size();
-  raft::allocate(verify, nnz);
-  raft::allocate(ex_scan, n_rows);
-  raft::allocate(result, nnz, true);
+  raft::allocate(verify, nnz, stream);
+  raft::allocate(ex_scan, n_rows, stream);
+  raft::allocate(result, nnz, stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
index 74aa9d6eaf..4434b65dde 100644
--- a/cpp/test/sparse/selection.cu
+++ b/cpp/test/sparse/selection.cu
@@ -57,25 +57,25 @@ class SparseSelectionTest
 void make_data() {
   std::vector dists_h = params.dists_h;
-  allocate(dists, n_rows * n_cols);
+  raft::allocate(dists, n_rows * n_cols, stream);
   update_device(dists, dists_h.data(), dists_h.size(), stream);
-  allocate(inds, n_rows * n_cols);
+  raft::allocate(inds, n_rows * n_cols, stream);
   iota_fill(inds, n_rows, n_cols, stream);
   std::vector out_dists_ref_h = params.out_dists_ref_h;
   std::vector out_indices_ref_h = params.out_indices_ref_h;
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_dists_ref, out_dists_ref_h.size());
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_dists_ref, out_dists_ref_h.size(), stream);
   update_device(out_indices_ref, out_indices_ref_h.data(),
                 out_indices_ref_h.size(), stream);
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 stream);
-  allocate(out_dists, n_rows * k);
-  allocate(out_indices, n_rows * k);
+  raft::allocate(out_dists, n_rows * k, stream);
+  raft::allocate(out_indices, n_rows * k, stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index 9deac1b82f..e73a8a547b 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -55,7 +55,7 @@ TEST_P(COOSort, Result) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in_vals, params.nnz);
+  raft::allocate(in_vals, params.nnz, stream);
   r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
   int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
@@ -68,9 +68,9 @@ TEST_P(COOSort, Result) {
     in_cols_h[i] = i;
 }
-  raft::allocate(in_rows, params.nnz);
-  raft::allocate(in_cols, params.nnz);
-  raft::allocate(verify, params.nnz);
+  raft::allocate(in_rows, params.nnz, stream);
+  raft::allocate(in_cols, params.nnz, stream);
+  raft::allocate(verify, params.nnz, stream);
   raft::update_device(in_rows, in_rows_h, params.nnz, stream);
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index a060b8128a..c854292b2d 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -64,9 +64,9 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index e62bec4381..a56dc03be3 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -29,16 +29,19 @@ template
 class HaversineKNNTest : public ::testing::Test {
  protected:
  void basicTest() {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
   // Allocate input
-  raft::allocate(d_train_inputs, n * d);
+  raft::allocate(d_train_inputs, n * d, stream);
   // Allocate reference arrays
-  raft::allocate(d_ref_I, n * n);
-  raft::allocate(d_ref_D, n * n);
+  raft::allocate(d_ref_I, n * n, stream);
+  raft::allocate(d_ref_D, n * n, stream);
   // Allocate predicted arrays
-  raft::allocate(d_pred_I, n * n);
-  raft::allocate(d_pred_D, n * n);
+  raft::allocate(d_pred_I, n * n, stream);
+  raft::allocate(d_pred_D, n * n, stream);
   // make testdata on host
   std::vector h_train_inputs = {
@@ -68,9 +71,6 @@ class HaversineKNNTest : public ::testing::Test {
   std::vector input_vec = {d_train_inputs};
   std::vector sizes_vec = {n};
-  cudaStream_t stream;
-  CUDA_CHECK(cudaStreamCreate(&stream));
   raft::spatial::knn::detail::haversine_knn(
     d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream);
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 2b1ef89f7a..edf46b3fe0 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -64,8 +64,8 @@ class KNNTest : public ::testing::TestWithParam {
   auto stream = handle_.get_stream();
-  raft::allocate(actual_labels_, rows_ * k_, true);
-  raft::allocate(expected_labels_, rows_ * k_, true);
+  raft::allocate(actual_labels_, rows_ * k_, stream, true);
+  raft::allocate(expected_labels_, rows_ * k_, stream, true);
   std::vector input_vec;
   std::vector sizes_vec;
@@ -100,6 +100,8 @@ class KNNTest : public ::testing::TestWithParam {
   cols_ = params_.input[0].size();
   k_ = params_.k;
+  cudaStream_t stream = handle_.get_stream();
+
   std::vector row_major_input;
   for (int i = 0; i < params_.input.size(); ++i) {
     for (int j = 0; j < params_.input[i].size(); ++j) {
@@ -107,24 +109,22 @@ class KNNTest : public ::testing::TestWithParam {
     }
   }
   rmm::device_buffer input_d = rmm::device_buffer(
-    row_major_input.data(), row_major_input.size() * sizeof(float),
-    handle_.get_stream());
+    row_major_input.data(), row_major_input.size() * sizeof(float), stream);
   float *input_ptr = static_cast(input_d.data());
   rmm::device_buffer labels_d = rmm::device_buffer(
-    params_.labels.data(), params_.labels.size() * sizeof(int),
-    handle_.get_stream());
+    params_.labels.data(), params_.labels.size() * sizeof(int), stream);
   int *labels_ptr = static_cast(labels_d.data());
-  raft::allocate(input_, rows_ * cols_, true);
-  raft::allocate(search_data_, rows_ * cols_, true);
-  raft::allocate(indices_, rows_ * k_, true);
-  raft::allocate(distances_, rows_ * k_, true);
-  raft::allocate(search_labels_, rows_, true);
+  raft::allocate(input_, rows_ * cols_, stream, true);
+  raft::allocate(search_data_, rows_ * cols_, stream, true);
+  raft::allocate(indices_, rows_ * k_, stream, true);
+  raft::allocate(distances_, rows_ * k_, stream, true);
+  raft::allocate(search_labels_, rows_, stream, true);
-  raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream());
-  raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream());
-  raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream());
+  raft::copy(input_, input_ptr, rows_ * cols_, stream);
+  raft::copy(search_data_, input_ptr, rows_ * cols_, stream);
+  raft::copy(search_labels_, labels_ptr, rows_, stream);
 }
 void TearDown() override {
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 4a3b0ed196..3c79fc2cae 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -52,8 +52,8 @@ class MeanTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_act, cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_act, cols, stream);
   r.normal(data, len, params.mean, (T)1.0, stream);
   meanSGtest(data, stream);
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 8b0d607561..89118b877b 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -54,10 +54,10 @@ class MeanCenterTest
   auto len = rows * cols;
   IdxType vecLen = params.bcastAlongRows ? cols : rows;
-  raft::allocate(out, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(data, len);
-  raft::allocate(meanVec, vecLen);
+  raft::allocate(out, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(data, len, stream);
+  raft::allocate(meanVec, vecLen, stream);
   r.normal(data, len, params.mean, (T)1.0, stream);
   raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor,
                     stream);
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index ff2698788f..6dfa186825 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -49,10 +49,10 @@ class StdDevTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_act, cols);
-  allocate(stddev_act, cols);
-  allocate(vars_act, cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_act, cols, stream);
+  raft::allocate(stddev_act, cols, stream);
+  raft::allocate(vars_act, cols, stream);
   r.normal(data, len, params.mean, params.stddev, stream);
   stdVarSGtest(data, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index c3140d4588..bf7a615f84 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -45,7 +45,7 @@ class SumTest : public ::testing::TestWithParam> {
   int len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   T data_h[len];
   for (int i = 0; i < len; i++) {
@@ -54,7 +54,7 @@ class SumTest : public ::testing::TestWithParam> {
   raft::update_device(data, data_h, len, stream);
-  raft::allocate(sum_act, cols);
+  raft::allocate(sum_act, cols, stream);
   sum(sum_act, data, cols, rows, false, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
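
[Note: every test file above follows the same mechanical migration: create
the CUDA stream first, then pass it to each raft::allocate call. A condensed
sketch of the resulting fixture shape (names and sizes are illustrative, not
taken from any one file):

    template <typename T>
    class ExampleTest : public ::testing::Test {
     protected:
      void SetUp() override {
        CUDA_CHECK(cudaStreamCreate(&stream));
        raft::allocate(data, len, stream);    // stream-ordered allocation
      }
      void TearDown() override {
        raft::deallocate(data, len, stream);  // release on the same stream
        CUDA_CHECK(cudaStreamDestroy(stream));
      }
      cudaStream_t stream;
      T* data;
      static const size_t len = 256;
    };

The next patch in the series centralizes the cleanup so fixtures no longer
need to track allocation lengths themselves.]
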
From e571293dcb126da17ce58a72bdec8f8075a3322c Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Wed, 7 Jul 2021 19:43:03 +0200
Subject: [PATCH 08/17] RAFT alloc + dealloc helper system

---
 cpp/include/raft/cudart_utils.h               | 33 +++++++++++++++----
 cpp/test/distance/dist_adj.cu                 | 12 ++-----
 cpp/test/distance/distance_base.cuh           | 12 +++----
 cpp/test/distance/fused_l2_nn.cu              | 14 ++------
 cpp/test/label/label.cu                       |  6 ++--
 cpp/test/linalg/add.cu                        |  6 +---
 cpp/test/linalg/coalesced_reduction.cu        |  9 +++--
 cpp/test/linalg/divide.cu                     |  9 +++--
 cpp/test/linalg/eig.cu                        | 10 +-----
 cpp/test/linalg/eig_sel.cu                    |  9 ++---
 cpp/test/linalg/map.cu                        | 12 +++----
 cpp/test/linalg/map_then_reduce.cu            |  9 +++--
 cpp/test/linalg/matrix_vector_op.cu           | 11 +++----
 cpp/test/linalg/multiply.cu                   |  9 +++--
 cpp/test/linalg/norm.cu                       |  6 ++--
 cpp/test/linalg/reduce.cu                     |  4 +--
 cpp/test/linalg/strided_reduction.cu          |  4 +--
 cpp/test/linalg/subtract.cu                   | 10 +++---
 cpp/test/linalg/svd.cu                        | 14 +++-----
 cpp/test/linalg/transpose.cu                  |  6 +---
 cpp/test/linalg/unary_op.cu                   |  5 +--
 cpp/test/matrix/math.cu                       | 21 ++----------
 cpp/test/matrix/matrix.cu                     |  9 +++--
 cpp/test/random/rng.cu                        | 21 +++++-------
 cpp/test/random/rng_int.cu                    |  8 ++---
 cpp/test/random/sample_without_replacement.cu |  6 +---
 cpp/test/sparse/add.cu                        | 14 ++------
 cpp/test/sparse/convert_coo.cu                |  4 +--
 cpp/test/sparse/convert_csr.cu                |  7 ++--
 cpp/test/sparse/csr_row_slice.cu              | 12 ++-----
 cpp/test/sparse/csr_to_dense.cu               |  8 ++---
 cpp/test/sparse/csr_transpose.cu              | 12 ++-----
 cpp/test/sparse/degree.cu                     |  7 ++--
 cpp/test/sparse/dist_coo_spmv.cu              |  9 +----
 cpp/test/sparse/distance.cu                   |  9 +----
 cpp/test/sparse/knn.cu                        | 10 +-----
 cpp/test/sparse/knn_graph.cu                  |  3 +-
 cpp/test/sparse/linkage.cu                    | 28 +++++++---------
 cpp/test/sparse/norm.cu                       |  7 ++--
 cpp/test/sparse/row_op.cu                     |  6 ++--
 cpp/test/sparse/selection.cu                  | 10 +-----
 cpp/test/spatial/haversine.cu                 | 12 +++----
 cpp/test/spatial/knn.cu                       |  6 ++--
 cpp/test/stats/mean.cu                        |  6 ++--
 cpp/test/stats/mean_center.cu                 | 10 +++---
 cpp/test/stats/stddev.cu                      | 10 +++---
 cpp/test/stats/sum.cu                         |  8 ++---
 47 files changed, 157 insertions(+), 316 deletions(-)

diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 8b8b3cbeca..3c07f1974d 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -26,6 +26,8 @@
 #include
 #include
 #include
+#include <mutex>
+#include <unordered_map>
 
 ///@todo: enable once logging has been enabled in raft
 //#include "logger.hpp"
@@ -260,19 +262,36 @@ void print_device_vector(const char* variable_name, const T* devMem,
 }
 /** @} */
 
+static std::mutex mutex_;
+static std::unordered_map<void*, size_t> allocations;
+
 template <typename Type>
 void allocate(Type*& ptr, size_t len, cudaStream_t stream,
               bool setZero = false) {
-  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(
-    len * sizeof(Type), stream);
-  if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, len * sizeof(Type)));
+  size_t size = len * sizeof(Type);
+  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
+  if (setZero) CUDA_CHECK(cudaMemset((void*)ptr, 0, size));
+
+  std::lock_guard _(mutex_);
+  allocations[ptr] = size;
 }
 
 template <typename Type>
-void deallocate(Type*& ptr, size_t len, cudaStream_t stream,
-                bool setZero = false) {
-  rmm::mr::get_current_device_resource()->deallocate(ptr, len * sizeof(Type),
-                                                     stream);
+void deallocate(Type*& ptr, cudaStream_t stream) {
+  std::lock_guard _(mutex_);
+  size_t size = allocations[ptr];
+  allocations.erase(ptr);
+  rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
+}
+
+inline void deallocate_all(cudaStream_t stream) {
+  std::lock_guard _(mutex_);
+  for (auto& alloc : allocations) {
+    void* ptr = alloc.first;
+    size_t size = alloc.second;
+    rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
+  }
+  allocations.clear();
 }
 
 /** helper method to get max usable shared mem per block parameter */
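
[Note: the helper system above keeps a mutex-guarded registry: allocate
records each pointer's byte size in the map, deallocate looks the size back
up (RMM's memory resources require the original size on deallocation), and
deallocate_all sweeps everything still registered. The intended TearDown
usage, sketched with an illustrative fixture rather than code from the patch:

    void TearDown() override {
      raft::deallocate_all(stream);     // frees every tracked allocation
      CUDA_CHECK(cudaStreamDestroy(stream));
    }

One caveat: because mutex_ and allocations are declared static at namespace
scope in a header, each translation unit gets its own copy of the registry.
That is fine for these single-file tests, but the registry would not track
allocations across compilation units.]
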
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index dc397a5eb2..8d5cd68f13 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -77,7 +77,6 @@ class DistanceAdjTest
   int n = params.n;
   int k = params.k;
   bool isRowMajor = params.isRowMajor;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(x, m * k, stream);
   raft::allocate(y, n * k, stream);
   raft::allocate(dist_ref, m * n, stream);
   raft::allocate(dist, m * n, stream);
@@ -103,21 +102,16 @@ class DistanceAdjTest
     x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor);
-  CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(workspace));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(x));
-    CUDA_CHECK(cudaFree(y));
-    CUDA_CHECK(cudaFree(dist_ref));
-    CUDA_CHECK(cudaFree(dist));
-  }
+  void TearDown() override {}
 protected:
   DistanceAdjInputs params;
   DataType *x, *y;
   bool *dist_ref, *dist;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index ac12076e3a..2b0d718890 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -167,7 +167,6 @@ class DistanceTest : public ::testing::TestWithParam> {
   int n = params.n;
   int k = params.k;
   bool isRowMajor = params.isRowMajor;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(x, m * k, stream);
   raft::allocate(y, n * k, stream);
@@ -189,21 +188,18 @@ class DistanceTest : public ::testing::TestWithParam> {
   distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace,
                    worksize, stream, isRowMajor);
-  CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(workspace));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(x));
-  CUDA_CHECK(cudaFree(y));
-  CUDA_CHECK(cudaFree(dist_ref));
-  CUDA_CHECK(cudaFree(dist));
-  CUDA_CHECK(cudaFree(dist2));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   DistanceInputs params;
   DataType *x, *y, *dist_ref, *dist, *dist2;
+  cudaStream_t stream;
 };
 } // end namespace distance
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 8f12b26dc0..cfea4ee2d9 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -122,15 +122,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> {
 }
 void TearDown() override {
-  CUDA_CHECK(cudaStreamSynchronize(stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(x));
-  CUDA_CHECK(cudaFree(y));
-  CUDA_CHECK(cudaFree(xn));
-  CUDA_CHECK(cudaFree(yn));
-  CUDA_CHECK(cudaFree(workspace));
-  CUDA_CHECK(cudaFree(min_ref));
-  CUDA_CHECK(cudaFree(min));
 }
 protected:
@@ -286,10 +279,7 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   raft::allocate(min1, m, stream);
 }
-  void TearDown() override {
-    FusedL2NNTest::TearDown();
-    CUDA_CHECK(cudaFree(min1));
-  }
+  void TearDown() override { FusedL2NNTest::TearDown(); }
 protected:
   cub::KeyValuePair *min1;
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 86a8927283..b28c754a5a 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -62,9 +62,8 @@ TEST_F(MakeMonotonicTest, Result) {
   ASSERT_TRUE(devArrMatch(actual, expected, m, raft::Compare(), stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(data));
-  CUDA_CHECK(cudaFree(actual));
   delete data_h;
   delete expected_h;
@@ -100,9 +99,8 @@ TEST(labelTest, Classlabels) {
   EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows,
                               raft::Compare(), stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(y_d));
-  CUDA_CHECK(cudaFree(y_relabeled_d));
 }
 }; // namespace label
 }; // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index ab7de0f24b..301f069a33 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -43,11 +43,7 @@ class AddTest : public ::testing::TestWithParam> {
 }
 void TearDown() override {
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  CUDA_CHECK(cudaFree(in1));
-  CUDA_CHECK(cudaFree(in2));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index bfbaf9b5f9..45dbd9dcc4 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -57,7 +57,6 @@ class coalescedReductionTest
   raft::random::Rng r(params.seed);
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(data, len, stream);
   raft::allocate(dots_exp, rows, stream);
   raft::allocate(dots_act, rows, stream);
@@ -70,18 +69,18 @@
   // Add to result with inplace = true next
   coalescedReductionLaunch(dots_act, data, cols, rows, stream, true);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(data));
-  CUDA_CHECK(cudaFree(dots_exp));
-  CUDA_CHECK(cudaFree(dots_act));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   coalescedReductionInputs params;
   T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 07fbda7e1c..563f96c835 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -51,7 +51,6 @@ class DivideTest
   ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
   int len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveDivide(out_ref, in, params.scalar, len, stream);
   divideScalar(out, in, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   UnaryOpInputs params;
   T *in, *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index d53713f004..6e26757cf3 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -97,15 +97,7 @@ class EigTest : public ::testing::TestWithParam> {
            sweeps);
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(cov_matrix));
-    CUDA_CHECK(cudaFree(eig_vectors));
-    CUDA_CHECK(cudaFree(eig_vectors_jacobi));
-    CUDA_CHECK(cudaFree(eig_vals));
-    CUDA_CHECK(cudaFree(eig_vals_jacobi));
-    CUDA_CHECK(cudaFree(eig_vectors_ref));
-    CUDA_CHECK(cudaFree(eig_vals_ref));
-  }
+  void TearDown() override { raft::deallocate_all(stream); }
 protected:
   EigInputs params;
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 26c8177269..bdd0a08ff6 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -72,15 +72,10 @@ class EigSelTest : public ::testing::TestWithParam> {
   eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors,
            eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(cov_matrix));
-    CUDA_CHECK(cudaFree(eig_vectors));
-    CUDA_CHECK(cudaFree(eig_vals));
-    CUDA_CHECK(cudaFree(eig_vectors_ref));
-    CUDA_CHECK(cudaFree(eig_vals_ref));
-  }
+  void TearDown() override { raft::deallocate_all(stream); }
 protected:
   EigSelInputs params;
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 6087893006..5b13fb5362 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -51,7 +51,6 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
   scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream);
-  CUDA_CHECK(cudaFree(tmp));
 }
 template
@@ -63,7 +62,6 @@ class MapTest
   ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
   raft::allocate(in1, len, stream);
   raft::allocate(in2, len, stream);
   raft::allocate(in3, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in3, len, InType(-1.0), InType(1.0), stream);
   create_ref(out_ref, in1, in2, in3, params.scalar, len, stream);
   mapLaunch(out, in1, in2, in3, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in1));
-  CUDA_CHECK(cudaFree(in2));
-  CUDA_CHECK(cudaFree(in3));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MapInputs params;
   InType *in1, *in2, *in3;
   OutType *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 49b7deaf1f..edc6382706 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -75,26 +75,25 @@ class MapReduceTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   auto len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   mapReduceLaunch(out_ref, out, in, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MapReduceInputs params;
   InType *in;
   OutType *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 933f806ff5..e017ee0918 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -66,7 +66,6 @@ class MatVecOpTest
   IdxType N = params.rows, D = params.cols;
   IdxType len = N * D;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
@@ -86,20 +85,18 @@ class MatVecOpTest
 }
   matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor,
                        params.bcastAlongRows, params.useTwoVectors, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(vec1));
-  CUDA_CHECK(cudaFree(vec2));
-  CUDA_CHECK(cudaFree(out));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(in));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MatVecOpInputs params;
   T *in, *out, *out_ref, *vec1, *vec2;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 33f2b1c104..d7bda7c27d 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -31,7 +31,6 @@ class MultiplyTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
   int len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, params.scalar, len, stream);
   multiplyScalar(out, in, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-
CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 29ea0a1915..5563064982 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -157,13 +157,11 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index dba8f76a22..8a3cf5d885 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -77,9 +77,7 @@ class ReduceTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 3fe076f1d4..55d8cc0e92 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -60,9 +60,7 @@ class stridedReductionTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 7318410330..27dea8503f 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -79,7 +79,6 @@ class SubtractTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in1, len, stream); raft::allocate(in2, len, stream); @@ -95,19 +94,18 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out, out, T(1), len, stream); subtract(in1, in1, in2, len, stream); subtractScalar(in1, in1, T(1), len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SubtractInputs params; T *in1, *in2, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index adfbfd01d6..72a27790de 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -48,7 +48,7 @@ class SvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream = handle.get_stream(); + stream = handle.get_stream(); raft::allocate(data, len, stream); ASSERT(params.n_row == 3, "This test only supports nrows=3!"); @@ -87,22 +87,16 @@ class SvdTest : public ::testing::TestWithParam> { svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(left_eig_vectors_qr)); - CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); - CUDA_CHECK(cudaFree(sing_vals_qr)); - CUDA_CHECK(cudaFree(left_eig_vectors_ref)); - CUDA_CHECK(cudaFree(right_eig_vectors_ref)); - CUDA_CHECK(cudaFree(sing_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: SvdInputs params; T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 08be179c59..c574f54a05 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -63,11 +63,7 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(data, params.n_row, stream); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(data_trans)); - CUDA_CHECK(cudaFree(data_trans_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: TranposeInputs params; diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 8ef810d794..042e8b9cbf 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -60,11 +60,8 @@ class UnaryOpTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); } virtual void DoTest() { diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 227aca643e..63381dec07 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -116,7 +116,7 @@ class MathTest : public ::testing::TestWithParam> { int len = params.len; raft::handle_t handle; - cudaStream_t stream; + stream = handle.get_stream(); CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in_power, len, stream); @@ -174,31 +174,16 @@ class MathTest : public ::testing::TestWithParam> { update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); setSmallValuesZero(in_smallzero, 4, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(in_power)); - CUDA_CHECK(cudaFree(out_power_ref)); - CUDA_CHECK(cudaFree(in_sqrt)); - CUDA_CHECK(cudaFree(out_sqrt_ref)); - CUDA_CHECK(cudaFree(in_ratio)); - CUDA_CHECK(cudaFree(out_ratio_ref)); - CUDA_CHECK(cudaFree(in_sign_flip)); - CUDA_CHECK(cudaFree(out_sign_flip_ref)); - CUDA_CHECK(cudaFree(in_recip)); - CUDA_CHECK(cudaFree(in_recip_ref)); - CUDA_CHECK(cudaFree(out_recip)); - CUDA_CHECK(cudaFree(in_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: MathInputs params; T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 047415957c..cc88df0a73 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -44,7 +44,6 @@ class MatrixTest : public ::testing::TestWithParam> { params = 
::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in1, len, stream); raft::allocate(in2, len, stream); @@ -58,18 +57,18 @@ class MatrixTest : public ::testing::TestWithParam> { T *outTrunc; raft::allocate(outTrunc, 6, stream); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - // CUDA_CHECK(cudaFree(in1_revr)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatrixInputs params; T *in1, *in2, *in1_revr; + cudaStream_t stream; }; const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index ee5312de22..c2ec7a340f 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -85,7 +85,6 @@ class RngTest : public ::testing::TestWithParam> { // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; params = ::testing::TestWithParam>::GetParam(); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); raft::allocate(data, params.len, stream); @@ -124,12 +123,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -182,6 +181,7 @@ class RngTest : public ::testing::TestWithParam> { T *data, *stats; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; // The measured mean and standard deviation for each tested distribution are, @@ -416,10 +416,8 @@ TEST(Rng, MeanError) { ASSERT_TRUE( (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_result)); - CUDA_CHECK(cudaFree(std_result)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -515,7 +513,6 @@ class RngNormalTableTest params = ::testing::TestWithParam>::GetParam(); int len = params.rows * params.cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); raft::allocate(data, len, stream); @@ -532,13 +529,12 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); - CUDA_CHECK(cudaFree(mu_vec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -551,6 +547,7 @@ class RngNormalTableTest T *data, *stats, *mu_vec; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; typedef RngNormalTableTest RngNormalTableTestF; diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index cd948d50aa..a98619e5b4 100644 --- a/cpp/test/random/rng_int.cu +++ 
b/cpp/test/random/rng_int.cu @@ -70,7 +70,6 @@ class RngTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, params.len, stream); raft::allocate(stats, 2, stream, true); @@ -87,12 +86,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(float meanvar[2]) { @@ -110,6 +109,7 @@ class RngTest : public ::testing::TestWithParam> { T *data; float *stats; float h_stats[2]; // mean, var + cudaStream_t stream; }; typedef RngTest RngTestU32; diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index 7883c8fe1b..cf60f46afe 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -67,12 +67,8 @@ class SWoRTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(wts)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(outIdx)); } protected: diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 5b2e5bd5b4..8429a46941 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -110,18 +110,8 @@ class CSRAddTest } void TearDown() override { - CUDA_CHECK(cudaFree(ind_a)); - CUDA_CHECK(cudaFree(ind_b)); - CUDA_CHECK(cudaFree(ind_result)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(ind_ptr_verify)); - CUDA_CHECK(cudaFree(ind_ptr_result)); - CUDA_CHECK(cudaFree(values_a)); - CUDA_CHECK(cudaFree(values_b)); - CUDA_CHECK(cudaFree(values_verify)); - CUDA_CHECK(cudaFree(values_result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index 8af6d30d7d..4f9c00c7ab 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -62,9 +62,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 871d8475e8..465aad4e7f 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -132,11 +132,8 @@ class CSRAdjGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 1cf88e7a77..00e6899cb2 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -109,16 +109,8 @@ class CSRRowSliceTest } void TearDown() 
override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 4a7c9b7b10..7f6b7dad07 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -88,12 +88,8 @@ class CSRToDenseTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index 352a656bf7..e50a9d94a9 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -106,16 +106,8 @@ class CSRTransposeTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 68dc1b51ac..eede0dfc10 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -67,8 +67,7 @@ TEST_P(COODegree, Result) { ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } @@ -98,9 +97,7 @@ TEST_P(COODegreeNonzero, Result) { ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 6b18e773cc..563dcf6f15 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -208,14 +208,7 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 4ae95ae232..9115dbf0b5 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -88,14 +88,7 @@ class SparseDistanceTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - 
CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8b5c9edebe..22f97559b1 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -80,15 +80,7 @@ class SparseKNNTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index a89529ddb3..945943bd5a 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -91,7 +91,8 @@ class KNNGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(X)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 4d1a5a9ad6..d2222d12bf 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -154,30 +154,27 @@ template class LinkageTest : public ::testing::TestWithParam> { protected: void basicTest() { - raft::handle_t handle; + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, stream); // Allocate result labels and expected labels on device - raft::allocate(labels, params.n_row, handle.get_stream()); - raft::allocate(labels_ref, params.n_row, handle.get_stream()); + raft::allocate(labels, params.n_row, stream); + raft::allocate(labels_ref, params.n_row, stream); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), stream); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, stream); out_arrs.children = out_children.data(); + raft::handle_t handle; raft::hierarchy::single_linkage< IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( handle, data.data(), params.n_row, params.n_col, @@ -186,22 +183,21 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, handle.get_stream()); + score = compute_rand_index(labels, labels_ref, params.n_row, stream); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(labels_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: LinkageInputs params; IdxT *labels, *labels_ref; - double score; + cudaStream_t stream; }; const std::vector> linkage_inputsf2 = { diff --git 
a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index f9a30d5147..d69dd15c57 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -77,11 +77,8 @@ class CSRRowNormalizeTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index e584598f85..805a3d85da 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -75,10 +75,8 @@ class CSRRowOpTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 4434b65dde..256ecfdfb7 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -97,15 +97,7 @@ class SparseSelectionTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CUDA_CHECK(cudaFree(dists)); - CUDA_CHECK(cudaFree(inds)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index a56dc03be3..542f2d6fc8 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -29,7 +29,6 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Allocate input @@ -74,17 +73,14 @@ class HaversineKNNTest : public ::testing::Test { raft::spatial::knn::detail::haversine_knn( d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(d_train_inputs)); - CUDA_CHECK(cudaFree(d_pred_I)); - CUDA_CHECK(cudaFree(d_pred_D)); - CUDA_CHECK(cudaFree(d_ref_I)); - CUDA_CHECK(cudaFree(d_ref_D)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: @@ -100,6 +96,8 @@ class HaversineKNNTest : public ::testing::Test { value_idx *d_ref_I; value_t *d_ref_D; + + cudaStream_t stream; }; typedef HaversineKNNTest HaversineKNNTestF; diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index edf46b3fe0..8682dea7fc 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -128,10 +128,8 @@ class KNNTest : public ::testing::TestWithParam { } void TearDown() override { - CUDA_CHECK(cudaFree(search_data_)); - CUDA_CHECK(cudaFree(indices_)); - CUDA_CHECK(cudaFree(distances_)); - CUDA_CHECK(cudaFree(actual_labels_)); + cudaStream_t stream = handle_.get_stream(); + raft::deallocate_all(stream); } private: diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 3c79fc2cae..a3c88a92be 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -49,7 +49,6 @@ class MeanTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); @@ -66,13 +65,14 @@ 
class MeanTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanInputs params; T *data, *mean_act; + cudaStream_t stream; }; // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 89118b877b..b827230b5d 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -47,7 +47,6 @@ class MeanCenterTest params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; @@ -65,19 +64,18 @@ class MeanCenterTest params.bcastAlongRows, stream); raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(meanVec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanCenterInputs params; T *data, *meanVec, *out, *out_ref; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 6dfa186825..fd374249d2 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -47,7 +47,6 @@ class StdDevTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); raft::allocate(mean_act, cols, stream); @@ -55,7 +54,7 @@ class StdDevTest : public ::testing::TestWithParam> { raft::allocate(vars_act, cols, stream); r.normal(data, len, params.mean, params.stddev, stream); stdVarSGtest(data, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void stdVarSGtest(T *data, cudaStream_t stream) { @@ -73,15 +72,14 @@ class StdDevTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); - CUDA_CHECK(cudaFree(stddev_act)); - CUDA_CHECK(cudaFree(vars_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: StdDevInputs params; T *data, *mean_act, *stddev_act, *vars_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf7a615f84..58ebec7859 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -43,7 +43,6 @@ class SumTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); @@ -56,17 +55,18 @@ class SumTest : public ::testing::TestWithParam> { raft::allocate(sum_act, cols, stream); sum(sum_act, data, cols, rows, false, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(sum_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SumInputs params; T *data, *sum_act; + cudaStream_t stream; }; 
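Every fixture converted above lands on the same shape: the stream becomes a member so TearDown() can see it, SetUp() creates the stream and allocates through the tracking raft::allocate(), the test body ends with cudaStreamSynchronize() instead of destroying the stream early, and TearDown() calls raft::deallocate_all() before cudaStreamDestroy(). A condensed sketch of that pattern; the fixture name and buffer size are illustrative:

#include <gtest/gtest.h>
#include <raft/cudart_utils.h>

class PatternTest : public ::testing::Test {
 protected:
  void SetUp() override {
    CUDA_CHECK(cudaStreamCreate(&stream));
    raft::allocate(data, len, stream, true);    // tracked allocation
    // ... enqueue the work under test on `stream` ...
    CUDA_CHECK(cudaStreamSynchronize(stream));  // sync, but keep the stream alive
  }

  void TearDown() override {
    raft::deallocate_all(stream);               // one call frees every tracked pointer
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  static constexpr size_t len = 1024;
  float* data;
  cudaStream_t stream;
};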
const std::vector> inputsf = {{0.05f, 1024, 32, 1234ULL}, From 192882a5faf45680eb21f3e1ef1786a082958612 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 3 Aug 2021 12:03:13 +0200 Subject: [PATCH 09/17] Update handle exec policy --- cpp/include/raft/handle.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 1b1923abb7..611868f55f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -140,14 +140,10 @@ class handle_t { return cusparse_handle_; } - thrust_exec_policy_t get_thrust_policy() const { + rmm::exec_policy get_thrust_policy() const { std::lock_guard _(mutex_); if (!thrust_policy_initialized_) { - if (!thrust_policy_) { - thrust_policy_ = - (thrust_exec_policy_t*)malloc(sizeof(thrust_exec_policy_t)); - } - *thrust_policy_ = rmm::exec_policy(this->get_stream()); + thrust_policy_ = new rmm::exec_policy(get_stream()); thrust_policy_initialized_ = true; } return *thrust_policy_; @@ -240,7 +236,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - mutable thrust_exec_policy_t* thrust_policy_{nullptr}; + mutable rmm::exec_policy* thrust_policy_{nullptr}; mutable bool thrust_policy_initialized_{false}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; From e13696e09b88d902e11c9b514691a10e6e491408 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 4 Aug 2021 14:10:38 +0200 Subject: [PATCH 10/17] Small updates --- cpp/cmake/thirdparty/get_rmm.cmake | 17 ++++++++++++----- cpp/include/raft/handle.hpp | 3 --- cpp/include/raft/sparse/linalg/spectral.cuh | 4 +--- .../spatial/knn/detail/ann_quantized_faiss.cuh | 1 - 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..8a9f3ffe45 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,7 +14,7 @@ # limitations under the License. 
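For callers, the upshot of the get_thrust_policy() change in PATCH 09 above is that the handle can hand out an rmm::exec_policy bound to its current stream, so Thrust algorithms run on that stream and draw their temporary storage from RMM. A hedged usage sketch; the sort itself is illustrative, not part of the patch:

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/sort.h>

void sort_on_handle(const raft::handle_t& handle) {
  rmm::device_uvector<float> keys(1024, handle.get_stream());
  // ... fill `keys` asynchronously on the handle's stream ...
  // The policy enqueues on handle.get_stream() and allocates scratch via RMM.
  thrust::sort(handle.get_thrust_policy(), keys.begin(), keys.end());
}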
#============================================================================= -function(find_and_configure_rmm VERSION) +function(find_and_configure_rmm) if(TARGET rmm::rmm) return() @@ -26,13 +26,17 @@ function(find_and_configure_rmm VERSION) set(MAJOR_AND_MINOR "${VERSION}") endif() - rapids_cpm_find(rmm ${VERSION} + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(rmm ${PKG_VERSION} GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} + GIT_REPOSITORY https://github.com/${PKG_FORK}/rmm.git + GIT_TAG ${PKG_PINNED_TAG} GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" @@ -44,4 +48,7 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm(VERSION ${RAFT_MIN_VERSION_rmm} + FORK viclafargue + PINNED_TAG 84a1328bfb894ca23b8b5efce358473c358d47bf + ) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index aa0fd8af8d..bcec17e7d3 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -47,9 +47,6 @@ namespace raft { * necessary cuda kernels and/or libraries */ class handle_t { - using thrust_exec_policy_t = thrust::detail::execute_with_allocator< - rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; - private: static constexpr int kNumDefaultWorkerStreams = 0; diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index d15c2cdf23..ce0c4bbe6f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -63,8 +63,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = handle.get_thrust_policy(); - using thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; @@ -90,7 +88,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + raft::spectral::partition(handle, r_csr_m, eig_solver, no_op_cluster_solver_t{}, labels.data(), eigVals.data(), eigVecs.data()); diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 43bdf12a38..77ad4afe96 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,6 @@ #include -#include #include #include From afd1a1bfdc08b013b562cd6bda45dfa3bfeab945 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 4 Aug 2021 15:25:42 +0200 Subject: [PATCH 11/17] Use of CUDA stream view --- cpp/include/raft/cudart_utils.h | 37 +++++++++++++++++++-------------- cpp/test/sparse/degree.cu | 12 +++++------ cpp/test/spatial/haversine.cu | 6 +++--- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 3c07f1974d..b46b5457a4 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include 
#include @@ -203,9 +204,10 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, + rmm::cuda_stream_view stream) { + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, + stream.value())); } /** @@ -217,22 +219,22 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { /** performs a host to device copy */ template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { - copy(d_ptr, h_ptr, len, stream); + rmm::cuda_stream_view stream) { + copy(d_ptr, h_ptr, len, stream.value()); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { - copy(h_ptr, d_ptr, len, stream); + rmm::cuda_stream_view stream) { + copy(h_ptr, d_ptr, len, stream.value()); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, stream.value())); } /** @} */ @@ -266,30 +268,33 @@ static std::mutex mutex_; static std::unordered_map allocations; template -void allocate(Type*& ptr, size_t len, cudaStream_t stream, +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) { size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemset((void*)ptr, 0, size)); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, + stream.value()); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream.value())); std::lock_guard _(mutex_); allocations[ptr] = size; } template -void deallocate(Type*& ptr, cudaStream_t stream) { +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, + stream.value()); } -inline void deallocate_all(cudaStream_t stream) { +inline void deallocate_all(rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); for (auto& alloc : allocations) { void* ptr = alloc.first; size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + rmm::mr::get_current_device_resource()->deallocate(ptr, size, + stream.value()); } allocations.clear(); } diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index eede0dfc10..f8a469af45 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -59,10 +59,10 @@ TEST_P(COODegree, Result) { raft::allocate(verify, 5, stream, true); raft::allocate(results, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); - linalg::coo_degree<32>(in_rows, 5, results, 0); + linalg::coo_degree<32>(in_rows, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); @@ -88,9 +88,9 @@ TEST_P(COODegreeNonzero, Result) { raft::allocate(results, 5, stream, 
true); raft::allocate(in_vals, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); - raft::update_device(in_vals, *&in_vals_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); + raft::update_device(in_vals, *&in_vals_h, 5, stream); linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream); cudaDeviceSynchronize(); diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 542f2d6fc8..122d7f2d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -49,7 +49,7 @@ class HaversineKNNTest : public ::testing::Test { 0.53154002, -1.47049808, 0.72891737, -1.54095137}; h_train_inputs.resize(n); - raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); + raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, stream); std::vector h_res_D = { 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, @@ -59,13 +59,13 @@ class HaversineKNNTest : public ::testing::Test { 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); - raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); + raft::update_device(d_ref_D, h_res_D.data(), n * n, stream); std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); + raft::update_device(d_ref_I, h_res_I.data(), n * n, stream); std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; From 598463f9fcedef93549dc64b1d47c4b6afd37c59 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 11:04:51 +0200 Subject: [PATCH 12/17] Apply requested changes --- cpp/include/raft/label/classlabels.cuh | 1 - cpp/include/raft/linalg/eig.cuh | 2 -- 2 files changed, 3 deletions(-) diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index b7878dc276..e12620ee0f 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -66,7 +66,6 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); *n_unique = d_num_selected.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output *y_unique = diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 951e2b8e7a..7fdbd371af 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -65,7 +65,6 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, CUDA_CHECK(cudaGetLastError()); int dev_info = d_dev_info.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -128,7 +127,6 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); int dev_info = d_dev_info.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" "This usually occurs when some of the features do not vary enough."); From 819f698776ff9aa710cee1783a45990e7fc97f32 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 14:38:13 +0200 Subject: [PATCH 13/17] Completing adoption of rmm::exec_policy --- cpp/include/raft/sparse/op/filter.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 5383f6fe7e..492058f85f 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -93,14 +94,14 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); @@ -140,7 +141,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); From f990cf91c3518a128115808af5a4105aa105e8b4 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 14:38:50 +0200 Subject: [PATCH 14/17] get_rmm.cmake beck to default --- cpp/cmake/thirdparty/get_rmm.cmake | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 8a9f3ffe45..9efee7cf29 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,7 +14,7 @@ # limitations under the License. 
#============================================================================= -function(find_and_configure_rmm) +function(find_and_configure_rmm VERSION) if(TARGET rmm::rmm) return() @@ -26,17 +26,13 @@ function(find_and_configure_rmm) set(MAJOR_AND_MINOR "${VERSION}") endif() - set(oneValueArgs VERSION FORK PINNED_TAG) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - rapids_cpm_find(rmm ${PKG_VERSION} + rapids_cpm_find(rmm ${VERSION} GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS - GIT_REPOSITORY https://github.com/${PKG_FORK}/rmm.git - GIT_TAG ${PKG_PINNED_TAG} + GIT_REPOSITORY https://github.com/rapidsai/rmm.git + GIT_TAG branch-${MAJOR_AND_MINOR} GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" @@ -48,7 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(VERSION ${RAFT_MIN_VERSION_rmm} - FORK viclafargue - PINNED_TAG 84a1328bfb894ca23b8b5efce358473c358d47bf - ) +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file From be9b5bbdd25b9d1b01efa7d575ba46fe16a19cda Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 16 Aug 2021 13:12:58 +0200 Subject: [PATCH 15/17] Address requested changes --- cpp/include/raft/handle.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index bcec17e7d3..b045c67c98 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -146,10 +146,6 @@ class handle_t { return *thrust_policy_; } - rmm::exec_policy get_thrust_policy(cudaStream_t stream) const { - return rmm::exec_policy(stream); - } - // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); From c9e0fa82af6d5b0316df4706e2c914534a5ad4d8 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 18 Aug 2021 20:01:21 +0200 Subject: [PATCH 16/17] Replace stream.value() occurrences --- cpp/include/raft/cudart_utils.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b46b5457a4..85ca310530 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -206,8 +206,8 @@ class grid_1d_block_t { template void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, - stream.value())); + CUDA_CHECK( + cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -220,21 +220,21 @@ void copy(Type* dst, const Type* src, size_t len, template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) { - copy(d_ptr, h_ptr, len, stream.value()); + copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) { - copy(h_ptr, d_ptr, len, stream.value()); + copy(h_ptr, d_ptr, len, stream); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream.value())); + cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -271,9 +271,8 @@ template void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) { size_t 
size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, - stream.value()); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream.value())); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); std::lock_guard _(mutex_); allocations[ptr] = size; @@ -284,8 +283,7 @@ void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, - stream.value()); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); } inline void deallocate_all(rmm::cuda_stream_view stream) { @@ -293,8 +291,7 @@ inline void deallocate_all(rmm::cuda_stream_view stream) { for (auto& alloc : allocations) { void* ptr = alloc.first; size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, - stream.value()); + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); } allocations.clear(); } From c2a3fbd39ef4cd6ae70b7af9cbc3129798e3cf9e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 19 Aug 2021 11:11:55 +0200 Subject: [PATCH 17/17] Update RAFT handle --- cpp/include/raft/handle.hpp | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index b045c67c98..c925669530 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -64,6 +64,7 @@ class handle_t { }()), streams_(n_streams) { create_resources(); + thrust_policy_ = std::make_unique(user_stream_); } /** @@ -85,6 +86,7 @@ class handle_t { device_prop_initialized_ = true; create_resources(); set_stream(other.get_internal_stream(stream_id)); + thrust_policy_ = std::make_unique(user_stream_); } /** Destroys all held-up resources */ @@ -92,10 +94,7 @@ class handle_t { int get_device() const { return dev_id_; } - void set_stream(cudaStream_t stream) { - thrust_policy_initialized_ = false; - user_stream_ = stream; - } + void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); @@ -137,14 +136,7 @@ class handle_t { return cusparse_handle_; } - rmm::exec_policy get_thrust_policy() const { - std::lock_guard _(mutex_); - if (!thrust_policy_initialized_) { - thrust_policy_ = new rmm::exec_policy(get_stream()); - thrust_policy_initialized_ = true; - } - return *thrust_policy_; - } + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { @@ -229,8 +221,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - mutable rmm::exec_policy* thrust_policy_{nullptr}; - mutable bool thrust_policy_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; @@ -259,9 +250,6 @@ class handle_t { //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - if (thrust_policy_initialized_) { - delete thrust_policy_; - } //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); }
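The last two commits lean on rmm::cuda_stream_view being implicitly constructible from cudaStream_t and implicitly convertible back to it; that is why the explicit stream.value() calls could be dropped, and why the handle can build its rmm::exec_policy once in the constructor and return it by reference. A small sketch of that round trip, assuming only the CUDA_CHECK macro from cudart_utils.h; the functions are illustrative, not part of the patch:

#include <raft/cudart_utils.h>
#include <rmm/cuda_stream_view.hpp>

// A cuda_stream_view parameter accepts a raw cudaStream_t at the call site...
void zero_fill(void* ptr, size_t bytes, rmm::cuda_stream_view stream) {
  // ...and converts back implicitly where the CUDA runtime expects cudaStream_t.
  CUDA_CHECK(cudaMemsetAsync(ptr, 0, bytes, stream));
}

void round_trip(void* ptr, size_t bytes) {
  cudaStream_t raw;
  CUDA_CHECK(cudaStreamCreate(&raw));
  zero_fill(ptr, bytes, raw);  // implicit cudaStream_t -> cuda_stream_view
  CUDA_CHECK(cudaStreamSynchronize(raw));
  CUDA_CHECK(cudaStreamDestroy(raw));
}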