diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index dfdbfa2d07..e8db6827b5 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -38,25 +38,23 @@ void select_k(raft::resources const& handle,
               T* out_val,
               IdxT* out_idx,
               bool select_min,
-              rmm::mr::device_memory_resource* mr = nullptr,
-              bool sorted                         = false,
-              SelectAlgo algo                     = SelectAlgo::kAuto) RAFT_EXPLICIT;
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                                   \
-  extern template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                                      const T* in_val,                     \
-                                                      const IdxT* in_idx,                  \
-                                                      size_t batch_size,                   \
-                                                      size_t len,                          \
-                                                      int k,                               \
-                                                      T* out_val,                          \
-                                                      IdxT* out_idx,                       \
-                                                      bool select_min,                     \
-                                                      rmm::mr::device_memory_resource* mr, \
-                                                      bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                             \
+  extern template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                                      const T* in_val,               \
+                                                      const IdxT* in_idx,            \
+                                                      size_t batch_size,             \
+                                                      size_t len,                    \
+                                                      int k,                         \
+                                                      T* out_val,                    \
+                                                      IdxT* out_idx,                 \
+                                                      bool select_min,               \
+                                                      bool sorted,                   \
                                                       raft::matrix::SelectAlgo algo)
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index 0a6f292e68..8f40e6ae00 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -23,13 +23,12 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
-#include <raft/matrix/init.cuh>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/linalg/map.cuh>
 #include <raft/matrix/select_k_types.hpp>
 
-#include <raft/core/resource/thrust_policy.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-#include <thrust/scan.h>
+#include <cub/cub.cuh>
 
 namespace raft::matrix::detail {
 
@@ -95,15 +94,17 @@ void segmented_sort_by_key(raft::resources const& handle,
                            const ValT* offsets,
                            bool asc)
 {
-  auto stream    = raft::resource::get_cuda_stream(handle);
-  auto out_inds  = raft::make_device_vector<ValT, ValT>(handle, n_elements);
-  auto out_dists = raft::make_device_vector<KeyT, ValT>(handle, n_elements);
+  auto stream = resource::get_cuda_stream(handle);
+  auto mr     = resource::get_workspace_resource(handle);
+  auto out_inds =
+    raft::make_device_mdarray<ValT, ValT>(handle, mr, raft::make_extents<ValT>(n_elements));
+  auto out_dists =
+    raft::make_device_mdarray<KeyT, ValT>(handle, mr, raft::make_extents<ValT>(n_elements));
 
   // Determine temporary device storage requirements
-  auto d_temp_storage       = raft::make_device_vector<char, int>(handle, 0);
   size_t temp_storage_bytes = 0;
   if (asc) {
-    cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(),
+    cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
                                              temp_storage_bytes,
                                              keys,
                                              out_dists.data_handle(),
@@ -117,7 +118,7 @@ void segmented_sort_by_key(raft::resources const& handle,
                                              sizeof(ValT) * 8,
                                              stream);
   } else {
-    cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)d_temp_storage.data_handle(),
+    cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
                                                        temp_storage_bytes,
                                                        keys,
                                                        out_dists.data_handle(),
@@ -132,7 +133,8 @@ void segmented_sort_by_key(raft::resources const& handle,
                                                        stream);
   }
 
-  d_temp_storage = raft::make_device_vector<char, int>(handle, temp_storage_bytes);
+  auto d_temp_storage = raft::make_device_mdarray<char, size_t>(
+    handle, mr, raft::make_extents<size_t>(temp_storage_bytes));
 
   if (asc) {
     // Run sorting operation
@@ -201,6 +203,7 @@ void segmented_sort_by_key(raft::resources const& handle,
  * @tparam IdxT
  *   the index type (what is being selected together with the keys).
  *
+ * @param[in] handle container of reusable resources
  * @param[in] in_val
  *   contiguous device array of inputs of size (len * batch_size);
  *   these are compared and selected.
@@ -222,9 +225,10 @@ void segmented_sort_by_key(raft::resources const& handle,
  *   the payload selected together with `out_val`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
- * @param stream
- * @param mr an optional memory resource to use across the calls (you can provide a large enough
- *           memory pool here to avoid memory allocations within the call).
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& handle,
@@ -236,24 +240,21 @@ void select_k(raft::resources const& handle,
               T* out_val,
               IdxT* out_idx,
               bool select_min,
-              rmm::mr::device_memory_resource* mr = nullptr,
-              bool sorted                         = false,
-              SelectAlgo algo                     = SelectAlgo::kAuto)
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
 
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-
   if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
 
-  auto stream = raft::resource::get_cuda_stream(handle);
   switch (algo) {
     case SelectAlgo::kRadix8bits:
     case SelectAlgo::kRadix11bits:
     case SelectAlgo::kRadix11bitsExtraPass: {
       if (algo == SelectAlgo::kRadix8bits) {
-        detail::select::radix::select_k<T, IdxT, 8, 512>(in_val,
+        detail::select::radix::select_k<T, IdxT, 8, 512>(handle,
+                                                         in_val,
                                                          in_idx,
                                                          batch_size,
                                                          len,
@@ -261,13 +262,13 @@ void select_k(raft::resources const& handle,
                                                          out_val,
                                                          out_idx,
                                                          select_min,
-                                                         true,  // fused_last_filter
-                                                         stream,
-                                                         mr);
+                                                         true  // fused_last_filter
+        );
 
       } else {
         bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
-        detail::select::radix::select_k<T, IdxT, 11, 512>(in_val,
+        detail::select::radix::select_k<T, IdxT, 11, 512>(handle,
+                                                          in_val,
                                                           in_idx,
                                                           batch_size,
                                                           len,
@@ -275,20 +276,12 @@ void select_k(raft::resources const& handle,
                                                           out_val,
                                                           out_idx,
                                                           select_min,
-                                                          fused_last_filter,
-                                                          stream,
-                                                          mr);
+                                                          fused_last_filter);
       }
       if (sorted) {
-        auto offsets = raft::make_device_vector<IdxT, IdxT>(handle, (IdxT)(batch_size + 1));
-
-        raft::matrix::fill(handle, offsets.view(), (IdxT)k);
-
-        thrust::exclusive_scan(raft::resource::get_thrust_policy(handle),
-                               offsets.data_handle(),
-                               offsets.data_handle() + offsets.size(),
-                               offsets.data_handle(),
-                               0);
+        auto offsets = make_device_mdarray<IdxT, IdxT>(
+          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
+        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
 
         auto keys = raft::make_device_vector_view<T, IdxT>(out_val, (IdxT)(batch_size * k));
         auto vals = raft::make_device_vector_view<IdxT, IdxT>(out_idx, (IdxT)(batch_size * k));
@@ -301,22 +294,22 @@ void select_k(raft::resources const& handle,
     case SelectAlgo::kWarpDistributed:
       return detail::select::warpsort::
         select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed>(
-          in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+          handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min);
     case SelectAlgo::kWarpDistributedShm:
       return detail::select::warpsort::
         select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
-          in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+          handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min);
     case SelectAlgo::kWarpAuto:
       return detail::select::warpsort::select_k<T, IdxT>(
-        in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+        handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min);
     case SelectAlgo::kWarpImmediate:
       return detail::select::warpsort::
         select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_immediate>(
-          in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+          handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min);
     case SelectAlgo::kWarpFiltered:
       return detail::select::warpsort::
         select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_filtered>(
-          in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+          handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min);
     default: RAFT_FAIL("K-selection Algorithm not supported.");
   }
 }
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index b6ed03b93d..16b9ac0c6d 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -19,6 +19,9 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/device_properties.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/device_atomics.cuh>
@@ -1157,6 +1160,7 @@ void radix_topk_one_block(const T* in,
  * @tparam BlockSize
  *   Number of threads in a kernel thread block.
  *
+ * @param[in] res container of reusable resources
  * @param[in] in
  *   contiguous device array of inputs of size (len * batch_size);
  *   these are compared and selected.
@@ -1184,12 +1188,10 @@ void radix_topk_one_block(const T* in,
  *   blocks is called. The later case is preferable when leading bits of input data are almost the
  *   same. That is, when the value range of input data is narrow. In such case, there could be a
  *   large number of inputs for the last filter, hence using multiple thread blocks is beneficial.
- * @param stream
- * @param mr an optional memory resource to use across the calls (you can provide a large enough
- *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
-void select_k(const T* in,
+void select_k(raft::resources const& res,
+              const T* in,
               const IdxT* in_idx,
               int batch_size,
               IdxT len,
@@ -1197,10 +1199,10 @@ void select_k(const T* in,
               T* out,
               IdxT* out_idx,
               bool select_min,
-              bool fused_last_filter,
-              rmm::cuda_stream_view stream,
-              rmm::mr::device_memory_resource* mr = nullptr)
+              bool fused_last_filter)
 {
+  auto stream = resource::get_cuda_stream(res);
+  auto mr     = resource::get_workspace_resource(res);
   if (k == len) {
     RAFT_CUDA_TRY(
       cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream));
@@ -1210,21 +1212,12 @@ void select_k(const T* in,
     } else {
       auto out_idx_view =
         raft::make_device_vector_view(out_idx, static_cast<size_t>(len) * batch_size);
-      raft::resources handle;
-      resource::set_cuda_stream(handle, stream);
-      raft::linalg::map_offset(handle, out_idx_view, raft::mod_const_op<IdxT>(len));
+      raft::linalg::map_offset(res, out_idx_view, raft::mod_const_op<IdxT>(len));
     }
     return;
   }
 
-  // TODO: use device_resources::get_device_properties() instead; should change it when we refactor
-  // resource management
-  int sm_cnt;
-  {
-    int dev;
-    RAFT_CUDA_TRY(cudaGetDevice(&dev));
-    RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_cnt, cudaDevAttrMultiProcessorCount, dev));
-  }
+  int sm_cnt = resource::get_device_properties(res).multiProcessorCount;
 
   constexpr int items_per_thread = 32;
 
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index 018eea2306..7cd43b030b 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,11 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/custom_resource.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/util/bitonic_sort.cuh>
+#include <raft/util/cache.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
@@ -773,6 +777,11 @@ __launch_bounds__(256) RAFT_KERNEL
   queue.store(out + block_id * k, out_idx + block_id * k);
 }
 
+struct launch_params {  // pair of values filled in by launch_setup::calc_optimal_params
+  int block_size    = 0;  // block size reported by the CUDA occupancy query
+  int min_grid_size = 0;  // minimum grid size reported by the same query
+};
+
 template <template <int, bool, typename, typename> class WarpSortClass,
           typename T,
           typename IdxT,
@@ -790,16 +799,13 @@ struct launch_setup {
    * @param[in] block_size_limit
    *   Forcefully limit the block size (optional)
    */
-  static void calc_optimal_params(int k,
-                                  int* block_size,
-                                  int* min_grid_size,
-                                  int block_size_limit = 0)
+  static auto calc_optimal_params(int k, int block_size_limit) -> launch_params
   {
     const int capacity = bound_by_power_of_two(k);
     if constexpr (Capacity > 1) {
       if (capacity < Capacity) {
         return launch_setup<WarpSortClass, T, IdxT, Capacity / 2>::calc_optimal_params(
-          capacity, block_size, min_grid_size, block_size_limit);
+          capacity, block_size_limit);
       }
     }
     ASSERT(capacity <= Capacity, "Requested k is too big (%d)", k);
@@ -807,12 +813,14 @@ struct launch_setup {
       int num_of_warp = block_size / std::min<int>(WarpSize, Capacity);
       return calc_smem_size_for_block_wide<T, IdxT>(num_of_warp, k);
     };
+    launch_params ps;
     RAFT_CUDA_TRY(cudaOccupancyMaxPotentialBlockSizeVariableSMem(
-      min_grid_size,
-      block_size,
+      &ps.min_grid_size,
+      &ps.block_size,
       block_kernel<WarpSortClass, Capacity, true, T, IdxT>,
       calc_smem,
       block_size_limit));
+    return ps;
   }
 
   static void kernel(int k,
@@ -869,6 +877,28 @@ struct launch_setup {
   }
 };
 
+template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
+struct warpsort_params_cache {  // one cache type per <WarpSortClass, T, IdxT> instantiation
+  static constexpr size_t kDefaultSize = 100;  // size argument given to the LRU cache below
+  cache::lru<uint64_t, std::hash<uint64_t>, std::equal_to<>, launch_params> value{kDefaultSize};
+};
+
+template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
+static auto calc_optimal_params(raft::resources const& res, int k, int block_size_limit = 0)
+  -> launch_params
+{  // memoized launch_setup::calc_optimal_params; the cache is a custom resource stored in `res`
+  uint64_t key = (uint64_t{uint32_t(k)} << 32) | uint32_t(block_size_limit);  // no sign extension
+  auto& cache =
+    resource::get_custom_resource<warpsort_params_cache<WarpSortClass, T, IdxT>>(res)->value;
+  launch_params val;
+  if (!cache.get(key, &val)) {  // miss: run the occupancy query once, then remember the answer
+    val =
+      launch_setup<WarpSortClass, T, IdxT, kMaxCapacity>::calc_optimal_params(k, block_size_limit);
+    cache.set(key, val);
+  }
+  return val;
+}
+
 template <template <int, bool, typename, typename> class WarpSortClass>
 struct LaunchThreshold {};
 
@@ -898,15 +928,19 @@ struct LaunchThreshold<warp_sort_immediate> {
 };
 
 template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
-void calc_launch_parameter(
-  size_t batch_size, size_t len, int k, int* p_num_of_block, int* p_num_of_warp)
+void calc_launch_parameter(raft::resources const& res,
+                           size_t batch_size,
+                           size_t len,
+                           int k,
+                           int* p_num_of_block,
+                           int* p_num_of_warp)
 {
   const int capacity               = bound_by_power_of_two(k);
   const int capacity_per_full_warp = std::max(capacity, WarpSize);
-  int block_size                   = 0;
-  int min_grid_size                = 0;
-  launch_setup<WarpSortClass, T, IdxT>::calc_optimal_params(k, &block_size, &min_grid_size);
-  block_size = Pow2<WarpSize>::roundDown(block_size);
+  auto lps                         = calc_optimal_params<WarpSortClass, T, IdxT>(res, k);
+  int block_size                   = lps.block_size;
+  int min_grid_size                = lps.min_grid_size;
+  block_size                       = Pow2<WarpSize>::roundDown(block_size);
 
   int num_of_warp;
   int num_of_block;
@@ -950,19 +984,16 @@ void calc_launch_parameter(
     // to occupy a single block well.
     block_size = adjust_block_size(block_size);
     do {
-      num_of_warp               = block_size / WarpSize;
-      int another_block_size    = 0;
-      int another_min_grid_size = 0;
-      launch_setup<WarpSortClass, T, IdxT>::calc_optimal_params(
-        k, &another_block_size, &another_min_grid_size, block_size);
-      another_block_size = adjust_block_size(another_block_size);
-      if (batch_size >= size_t(another_min_grid_size)  // still have enough work
-          && another_block_size < block_size           // protect against an infinite loop
-          && another_min_grid_size * another_block_size >
+      num_of_warp        = block_size / WarpSize;
+      auto another       = calc_optimal_params<WarpSortClass, T, IdxT>(res, k, block_size);
+      another.block_size = adjust_block_size(another.block_size);
+      if (batch_size >= size_t(another.min_grid_size)  // still have enough work
+          && another.block_size < block_size           // protect against an infinite loop
+          && another.min_grid_size * another.block_size >
                min_grid_size * block_size  // improve occupancy
       ) {
-        block_size    = another_block_size;
-        min_grid_size = another_min_grid_size;
+        block_size    = another.block_size;
+        min_grid_size = another.min_grid_size;
       } else {
         break;
       }
@@ -986,10 +1017,8 @@ void select_k_(int num_of_block,
                IdxT* out_idx,
                bool select_min,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr = nullptr)
+               rmm::mr::device_memory_resource* mr)
 {
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-
   rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);
   rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream, mr);
 
@@ -1034,20 +1063,20 @@ void select_k_(int num_of_block,
 }
 
 template <typename T, typename IdxT, template <int, bool, typename, typename> class WarpSortClass>
-void select_k_impl(const T* in,
+void select_k_impl(raft::resources const& res,
+                   const T* in,
                    const IdxT* in_idx,
                    size_t batch_size,
                    size_t len,
                    int k,
                    T* out,
                    IdxT* out_idx,
-                   bool select_min,
-                   rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr = nullptr)
+                   bool select_min)
 {
   int num_of_block = 0;
   int num_of_warp  = 0;
-  calc_launch_parameter<WarpSortClass, T, IdxT>(batch_size, len, k, &num_of_block, &num_of_warp);
+  calc_launch_parameter<WarpSortClass, T, IdxT>(
+    res, batch_size, len, k, &num_of_block, &num_of_warp);
 
   select_k_<WarpSortClass, T, IdxT>(num_of_block,
                                     num_of_warp,
@@ -1059,8 +1088,8 @@ void select_k_impl(const T* in,
                                     out,
                                     out_idx,
                                     select_min,
-                                    stream,
-                                    mr);
+                                    resource::get_cuda_stream(res),
+                                    resource::get_workspace_resource(res));
 }
 
 /**
@@ -1075,6 +1104,7 @@ void select_k_impl(const T* in,
  * @tparam IdxT
  *   the index type (what is being selected together with the keys).
  *
+ * @param[in] res container of reusable resources
  * @param[in] in
  *   contiguous device array of inputs of size (len * batch_size);
  *   these are compared and selected.
@@ -1096,21 +1126,17 @@ void select_k_impl(const T* in,
  *   the payload selected together with `out`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
- * @param stream
- * @param mr an optional memory resource to use across the calls (you can provide a large enough
- *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT>
-void select_k(const T* in,
+void select_k(raft::resources const& res,
+              const T* in,
               const IdxT* in_idx,
               size_t batch_size,
               size_t len,
               int k,
               T* out,
               IdxT* out_idx,
-              bool select_min,
-              rmm::cuda_stream_view stream,
-              rmm::mr::device_memory_resource* mr = nullptr)
+              bool select_min)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
@@ -1121,7 +1147,7 @@ void select_k(const T* in,
   int num_of_block = 0;
   int num_of_warp  = 0;
   calc_launch_parameter<warp_sort_immediate, T, IdxT>(
-    batch_size, len, k, &num_of_block, &num_of_warp);
+    res, batch_size, len, k, &num_of_block, &num_of_warp);
   int len_per_thread = len / (num_of_block * num_of_warp * std::min(capacity, WarpSize));
 
   if (len_per_thread <= LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing) {
@@ -1135,11 +1161,11 @@ void select_k(const T* in,
                                             out,
                                             out_idx,
                                             select_min,
-                                            stream,
-                                            mr);
+                                            resource::get_cuda_stream(res),
+                                            resource::get_workspace_resource(res));
   } else {
     calc_launch_parameter<warp_sort_filtered, T, IdxT>(
-      batch_size, len, k, &num_of_block, &num_of_warp);
+      res, batch_size, len, k, &num_of_block, &num_of_warp);
     select_k_<warp_sort_filtered, T, IdxT>(num_of_block,
                                            num_of_warp,
                                            in,
@@ -1150,8 +1176,8 @@ void select_k(const T* in,
                                            out,
                                            out_idx,
                                            select_min,
-                                           stream,
-                                           mr);
+                                           resource::get_cuda_stream(res),
+                                           resource::get_workspace_resource(res));
   }
 }
 
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 92d7db006d..e2d94c67ae 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -112,7 +112,6 @@ void select_k(raft::resources const& handle,
                                    out_val.data_handle(),
                                    out_idx.data_handle(),
                                    select_min,
-                                   nullptr,
                                    sorted,
                                    algo);
 }
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 09c58602a4..29d521566d 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -135,8 +135,7 @@ void search_impl(raft::resources const& handle,
                                            n_probes,
                                            coarse_distances_dev.data(),
                                            coarse_indices_dev.data(),
-                                           select_min,
-                                           search_mr);
+                                           select_min);
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes);
 
@@ -199,8 +198,7 @@ void search_impl(raft::resources const& handle,
                                          k,
                                          distances,
                                          neighbors,
-                                         select_min,
-                                         search_mr);
+                                         select_min);
   }
 }
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 129f9d6ecf..d000a1a4d3 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -31,6 +31,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/custom_resource.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/distance/distance_types.hpp>
@@ -39,6 +40,7 @@
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/detail/select_k.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/cache.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/device_loads_stores.cuh>
@@ -79,6 +81,12 @@ void select_clusters(raft::resources const& handle,
                      const float* cluster_centers,  // [n_lists, dim_ext]
                      rmm::mr::device_memory_resource* mr)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)",
+    n_probes,
+    n_queries,
+    n_lists,
+    dim);
   auto stream = resource::get_cuda_stream(handle);
   /* NOTE[qc_distances]
 
@@ -160,8 +168,7 @@ void select_clusters(raft::resources const& handle,
                                             n_probes,
                                             cluster_dists.data(),
                                             clusters_to_probe,
-                                            true,
-                                            mr);
+                                            true);
 }
 
 /**
@@ -408,6 +415,46 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters,
   return 1 + (n_queries - 1) * n_probes / (2 * n_clusters);
 }
 
+struct search_kernel_key {  // cache key: the non-device arguments of compute_similarity_select
+  bool manage_local_topk;
+  uint32_t locality_hint;  // ivfpq_search_worker stores its `coresidency` value here
+  double preferred_shmem_carveout;
+  uint32_t pq_bits;  // taken from index.pq_bits()
+  uint32_t pq_dim;   // taken from index.pq_dim()
+  uint32_t precomp_data_count;
+  uint32_t n_queries;
+  uint32_t n_probes;
+  uint32_t topk;
+};
+
+inline auto operator==(const search_kernel_key& a, const search_kernel_key& b) -> bool
+{  // field-wise equality over all nine members; the double carveout is compared exactly
+  return a.manage_local_topk == b.manage_local_topk && a.locality_hint == b.locality_hint &&
+         a.preferred_shmem_carveout == b.preferred_shmem_carveout && a.pq_bits == b.pq_bits &&
+         a.pq_dim == b.pq_dim && a.precomp_data_count == b.precomp_data_count &&
+         a.n_queries == b.n_queries && a.n_probes == b.n_probes && a.topk == b.topk;
+}
+
+struct search_kernel_key_hash {
+  inline auto operator()(const search_kernel_key& x) const noexcept -> std::size_t
+  {  // locality_hint and preferred_shmem_carveout do not feed the hash; operator== tells them apart
+    return (size_t{x.manage_local_topk} << 63) +
+           size_t{x.topk} * size_t{x.n_probes} * size_t{x.n_queries} +
+           size_t{x.precomp_data_count} * size_t{x.pq_dim} * size_t{x.pq_bits};
+  }
+};
+
+template <typename OutT, typename LutT, typename IvfSampleFilterT>
+struct search_kernel_cache {
+  /** Number of selected search kernel configurations to cache. */
+  static constexpr size_t kDefaultSize = 100;
+  cache::lru<search_kernel_key,
+             search_kernel_key_hash,
+             std::equal_to<>,
+             selected<OutT, LutT, IvfSampleFilterT>>
+    value{kDefaultSize};  // maps search_kernel_key -> result of compute_similarity_select
+};
+
 /**
  * The "main part" of the search, which assumes that outer-level `search` has already:
  *
@@ -432,6 +479,12 @@ void ivfpq_search_worker(raft::resources const& handle,
                          double preferred_shmem_carveout,
                          IvfSampleFilterT sample_filter)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "ivf_pq::search-worker(n_queries = %u, n_probes = %u, k = %u, dim = %zu)",
+    n_queries,
+    n_probes,
+    topK,
+    index.dim());
   auto stream = resource::get_cuda_stream(handle);
   auto mr     = resource::get_workspace_resource(handle);
 
@@ -534,17 +587,33 @@ void ivfpq_search_worker(raft::resources const& handle,
     } break;
   }
 
-  auto search_instance = compute_similarity_select<ScoreT, LutT, IvfSampleFilterT>(
-    resource::get_device_properties(handle),
-    manage_local_topk,
-    coresidency,
-    preferred_shmem_carveout,
-    index.pq_bits(),
-    index.pq_dim(),
-    precomp_data_count,
-    n_queries,
-    n_probes,
-    topK);
+  selected<ScoreT, LutT, IvfSampleFilterT> search_instance;
+  search_kernel_key search_key{manage_local_topk,
+                               coresidency,
+                               preferred_shmem_carveout,
+                               index.pq_bits(),
+                               index.pq_dim(),
+                               precomp_data_count,
+                               n_queries,
+                               n_probes,
+                               topK};
+  auto& cache =
+    resource::get_custom_resource<search_kernel_cache<ScoreT, LutT, IvfSampleFilterT>>(handle)
+      ->value;
+  if (!cache.get(search_key, &search_instance)) {
+    search_instance = compute_similarity_select<ScoreT, LutT, IvfSampleFilterT>(
+      resource::get_device_properties(handle),
+      manage_local_topk,
+      coresidency,
+      preferred_shmem_carveout,
+      index.pq_bits(),
+      index.pq_dim(),
+      precomp_data_count,
+      n_queries,
+      n_probes,
+      topK);
+    cache.set(search_key, search_instance);
+  }
 
   rmm::device_uvector<LutT> device_lut(search_instance.device_lut_size, stream, mr);
   std::optional<device_vector<float>> query_kths_buf{std::nullopt};
@@ -591,8 +660,7 @@ void ivfpq_search_worker(raft::resources const& handle,
                                              topK,
                                              topk_dists.data(),
                                              neighbors_uint32,
-                                             true,
-                                             mr);
+                                             true);
 
   // Postprocessing
   postprocess_distances(
@@ -695,7 +763,7 @@ inline auto get_max_batch_size(raft::resources const& res,
                                uint32_t max_samples) -> uint32_t
 {
   uint32_t max_batch_size         = n_queries;
-  uint32_t n_ctas_total           = getMultiProcessorCount() * 2;
+  uint32_t n_ctas_total           = resource::get_device_properties(res).multiProcessorCount * 2;
   uint32_t n_ctas_total_per_batch = n_ctas_total / max_batch_size;
   float utilization               = float(n_ctas_total_per_batch * max_batch_size) / n_ctas_total;
   if (n_ctas_total_per_batch > 1 || (n_ctas_total_per_batch == 1 && utilization < 0.6)) {
@@ -799,6 +867,8 @@ inline void search(raft::resources const& handle,
 
   for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) {
     uint32_t queries_batch = min(max_queries, n_queries - offset_q);
+    common::nvtx::range<common::nvtx::domain::raft> batch_scope(
+      "ivf_pq::search-batch(queries: %u - %u)", offset_q, offset_q + queries_batch);
 
     select_clusters(handle,
                     clusters_to_probe.data(),
diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
index 594fff0ba0..35d533316f 100644
--- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh
+++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/matrix/detail/select_k.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
@@ -90,8 +92,7 @@ void naive_knn(raft::resources const& handle,
                uint32_t k,
                raft::distance::DistanceType type)
 {
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-
+  auto mr     = resource::get_workspace_resource(handle);
   auto stream = raft::resource::get_cuda_stream(handle);
   dim3 block_dim(16, 32, 1);
   // maximum reasonable grid size in `y` direction
@@ -118,8 +119,7 @@ void naive_knn(raft::resources const& handle,
                                           static_cast<int>(k),
                                           dist_topk + offset * k,
                                           indices_topk + offset * k,
-                                          type != raft::distance::DistanceType::InnerProduct,
-                                          mr);
+                                          type != raft::distance::DistanceType::InnerProduct);
   }
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 }
diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
index 87e5d49d29..e32b4ef6f0 100644
--- a/cpp/src/matrix/detail/select_k_double_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(double, int64_t);
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
index 67dce0e166..9aa4e957af 100644
--- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -17,18 +17,17 @@
 #include <cstdint>  // uint32_t
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu
index 4be7c54839..7f163a0b0d 100644
--- a/cpp/src/matrix/detail/select_k_float_int32.cu
+++ b/cpp/src/matrix/detail/select_k_float_int32.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, int);
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
index 6337994e86..87b6525356 100644
--- a/cpp/src/matrix/detail/select_k_float_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, int64_t);
diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
index ad26547812..e698f811d8 100644
--- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
index e3c29a2033..0eee20b1fa 100644
--- a/cpp/src/matrix/detail/select_k_half_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
index 3e3a738915..f4e6bae21f 100644
--- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -16,18 +16,17 @@
 
 #include <raft/matrix/detail/select_k-inl.cuh>
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
-  template void raft::matrix::detail::select_k(raft::resources const& handle,       \
-                                               const T* in_val,                     \
-                                               const IdxT* in_idx,                  \
-                                               size_t batch_size,                   \
-                                               size_t len,                          \
-                                               int k,                               \
-                                               T* out_val,                          \
-                                               IdxT* out_idx,                       \
-                                               bool select_min,                     \
-                                               rmm::mr::device_memory_resource* mr, \
-                                               bool sorted,                         \
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
                                                raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);