From a9397843081cc011b1f408ab978fcebac90431d9 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 20 Mar 2023 11:23:04 +0100
Subject: [PATCH 01/25] Wrap the workspace resource into a
 limiting_resource_adaptor

---
 cpp/include/raft/core/device_resources.hpp    | 17 ++--
 .../core/resource/device_memory_resource.hpp  | 80 +++++++++++++++----
 cpp/include/raft/core/resources.hpp           |  7 +-
 3 files changed, 80 insertions(+), 24 deletions(-)
diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index df6b39a368..0275f08152 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -21,6 +21,7 @@
 
 #include <memory>
 #include <mutex>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -60,12 +61,13 @@ namespace raft {
 class device_resources : public resources {
  public:
   device_resources(const device_resources& handle,
-                   rmm::mr::device_memory_resource* workspace_resource)
+                   rmm::mr::device_memory_resource* workspace_resource,
+                   std::optional<std::size_t> allocation_limit = std::nullopt)
     : resources{handle}
   {
     // replace the resource factory for the workspace_resources
-    resources::add_resource_factory(
-      std::make_shared<resource::workspace_resource_factory>(workspace_resource));
+    resources::add_resource_factory(std::make_shared<resource::workspace_resource_factory>(
+      workspace_resource, allocation_limit, std::nullopt));
   }
 
   device_resources(const device_resources& handle) : resources{handle} {}
@@ -83,7 +85,8 @@ class device_resources : public resources {
    */
   device_resources(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
                    std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
-                   rmm::mr::device_memory_resource* workspace_resource = nullptr)
+                   rmm::mr::device_memory_resource* workspace_resource = nullptr,
+                   std::optional<std::size_t> allocation_limit         = std::nullopt)
     : resources{}
   {
     resources::add_resource_factory(std::make_shared<resource::device_id_resource_factory>());
@@ -91,8 +94,8 @@ class device_resources : public resources {
       std::make_shared<resource::cuda_stream_resource_factory>(stream_view));
     resources::add_resource_factory(
       std::make_shared<resource::cuda_stream_pool_resource_factory>(stream_pool));
-    resources::add_resource_factory(
-      std::make_shared<resource::workspace_resource_factory>(workspace_resource));
+    resources::add_resource_factory(std::make_shared<resource::workspace_resource_factory>(
+      workspace_resource, allocation_limit, std::nullopt));
   }
 
   /** Destroys all held-up resources */
@@ -255,4 +258,4 @@ class stream_syncer {
 
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 35ae3d715f..061ba09235 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -17,21 +17,52 @@
 
 #include <raft/core/resource/resource_types.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/limiting_resource_adaptor.hpp>
+
+#include <cstddef>
+#include <optional>
 
 namespace raft::resource {
-class device_memory_resource : public resource {
+class limited_memory_resource : public resource {
  public:
-  device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_)
+  limited_memory_resource(rmm::mr::device_memory_resource* mr,
+                          std::optional<std::size_t> allocation_limit,
+                          std::optional<std::size_t> alignment)
+    : mr_(make_adaptor(mr, allocation_limit, alignment))
   {
-    if (mr_ == nullptr) { mr = rmm::mr::get_current_device_resource(); }
   }
-  void* get_resource() override { return mr; }
 
-  ~device_memory_resource() override {}
+  auto get_resource() -> void* override { return &mr_; }
+
+  ~limited_memory_resource() override = default;
 
  private:
-  rmm::mr::device_memory_resource* mr;
+  rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> mr_;
+
+  static inline auto get_device_mem()
+  {
+    std::size_t free_size{};
+    std::size_t total_size{};
+    RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
+    return total_size;
+  }
+
+  static inline auto make_adaptor(rmm::mr::device_memory_resource* mr,
+                                  std::optional<std::size_t> allocation_limit,
+                                  std::optional<std::size_t> alignment)
+    -> rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>
+  {
+    if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
+    auto limit = allocation_limit.value_or(get_device_mem());
+    if (alignment.has_value()) {
+      return rmm::mr::limiting_resource_adaptor(mr, limit, alignment.value());
+    } else {
+      return rmm::mr::limiting_resource_adaptor(mr, limit);
+    }
+  }
 };
 
 /**
@@ -40,12 +71,22 @@ class device_memory_resource : public resource {
  */
 class workspace_resource_factory : public resource_factory {
  public:
-  workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {}
-  resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; }
-  resource* make_resource() override { return new device_memory_resource(mr); }
+  workspace_resource_factory(rmm::mr::device_memory_resource* mr,
+                             std::optional<std::size_t> allocation_limit,
+                             std::optional<std::size_t> alignment)
+    : mr_(mr), allocation_limit_(allocation_limit), alignment_(alignment)
+  {
+  }
+  auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; }
+  auto make_resource() -> resource* override
+  {
+    return new limited_memory_resource(mr_, allocation_limit_, alignment_);
+  }
 
  private:
-  rmm::mr::device_memory_resource* mr;
+  rmm::mr::device_memory_resource* mr_;
+  std::optional<std::size_t> allocation_limit_;
+  std::optional<std::size_t> alignment_;
 };
 
 /**
@@ -54,12 +95,15 @@ class workspace_resource_factory : public resource_factory {
  * @param res raft resources object for managing resources
  * @return device memory resource object
  */
-inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res)
+inline auto get_workspace_resource(resources const& res)
+  -> rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*
 {
   if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) {
-    res.add_resource_factory(std::make_shared<workspace_resource_factory>());
+    res.add_resource_factory(
+      std::make_shared<workspace_resource_factory>(nullptr, std::nullopt, std::nullopt));
   }
-  return res.get_resource<rmm::mr::device_memory_resource>(resource_type::WORKSPACE_RESOURCE);
+  return res.get_resource<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>>(
+    resource_type::WORKSPACE_RESOURCE);
 };
 
 /**
@@ -68,8 +112,12 @@ inline rmm::mr::device_memory_resource* get_workspace_resource(resources const&
  * @param res raft resources object for managing resources
  * @param mr a valid rmm device_memory_resource
  */
-inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr)
+inline void set_workspace_resource(resources const& res,
+                                   rmm::mr::device_memory_resource* mr         = nullptr,
+                                   std::optional<std::size_t> allocation_limit = std::nullopt,
+                                   std::optional<std::size_t> alignment        = std::nullopt)
 {
-  res.add_resource_factory(std::make_shared<workspace_resource_factory>(mr));
+  res.add_resource_factory(
+    std::make_shared<workspace_resource_factory>(mr, allocation_limit, alignment));
 };
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
index 64e281e934..e31918dc6d 100644
--- a/cpp/include/raft/core/resources.hpp
+++ b/cpp/include/raft/core/resources.hpp
@@ -94,6 +94,11 @@ class resources {
     RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY,
                  "LAST_KEY is a placeholder and not a valid resource factory type.");
     factories_.at(rtype) = std::make_pair(rtype, factory);
+    // Clear the corresponding resource, so that on next `get_resource` the new factory is used
+    if (resources_.at(rtype).first != resource::resource_type::LAST_KEY) {
+      resources_.at(rtype) = std::make_pair(resource::resource_type::LAST_KEY,
+                                            std::make_shared<resource::empty_resource>());
+    }
   }
 
   /**
@@ -128,4 +133,4 @@ class resources {
   mutable std::vector<pair_res_factory> factories_;
   mutable std::vector<pair_resource> resources_;
 };
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft

From ac5762bad46d30dbab04f0bbcefc89fe02e90af6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 20 Mar 2023 19:56:54 +0100
Subject: [PATCH 02/25] Set the pool memory resource by default and start the
 ivf-pq use case

---
 .../core/resource/device_memory_resource.hpp  | 60 ++++++++++++++++---
 .../raft/neighbors/detail/ivf_pq_search.cuh   | 16 ++---
 cpp/include/raft/neighbors/ivf_pq.cuh         |  6 +-
 3 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 061ba09235..ce6dc0623b 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -21,6 +21,7 @@
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/limiting_resource_adaptor.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <cstddef>
 #include <optional>
@@ -31,7 +32,16 @@ class limited_memory_resource : public resource {
   limited_memory_resource(rmm::mr::device_memory_resource* mr,
                           std::optional<std::size_t> allocation_limit,
                           std::optional<std::size_t> alignment)
-    : mr_(make_adaptor(mr, allocation_limit, alignment))
+    : limited_memory_resource(mr, get_alloc_limit(allocation_limit), alignment)
+  {
+  }
+
+  template <class Deleter>
+  limited_memory_resource(rmm::mr::device_memory_resource* mr,
+                          Deleter d,
+                          std::optional<std::size_t> allocation_limit,
+                          std::optional<std::size_t> alignment)
+    : limited_memory_resource(mr, d, get_alloc_limit(allocation_limit), alignment)
   {
   }
 
@@ -40,27 +50,59 @@ class limited_memory_resource : public resource {
   ~limited_memory_resource() override = default;
 
  private:
+  std::shared_ptr<rmm::mr::device_memory_resource> upstream_;
   rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> mr_;
 
-  static inline auto get_device_mem()
+  limited_memory_resource(rmm::mr::device_memory_resource* mr,
+                          std::size_t allocation_limit,
+                          std::optional<std::size_t> alignment)
+    : upstream_{get_upstream(mr, allocation_limit)},
+      mr_(make_adaptor(upstream_, allocation_limit, alignment))
+  {
+  }
+
+  template <class Deleter>
+  limited_memory_resource(rmm::mr::device_memory_resource* mr,
+                          Deleter d,
+                          std::size_t allocation_limit,
+                          std::optional<std::size_t> alignment)
+    : upstream_{get_upstream(mr, allocation_limit), d},
+      mr_{make_adaptor(upstream_, allocation_limit, alignment)}
+  {
+  }
+
+  static inline auto get_upstream(rmm::mr::device_memory_resource* mr, std::size_t allocation_limit)
+    -> rmm::mr::device_memory_resource*
+  {
+    if (mr != nullptr) { return mr; }
+    // Create a pool memory resource by default
+    constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu;
+    auto min_size                = std::min<std::size_t>(kOneGb, allocation_limit / 2);
+    auto max_size                = allocation_limit * 3lu / 2lu;
+    return new rmm::mr::pool_memory_resource(
+      rmm::mr::get_current_device_resource(), min_size, max_size);
+  }
+
+  static inline auto get_alloc_limit(std::optional<std::size_t> limit) -> std::size_t
   {
+    if (limit.has_value()) { return limit.value(); }
+    // Allow a fraction of available memory by default.
     std::size_t free_size{};
     std::size_t total_size{};
     RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
-    return total_size;
+    return free_size / 2;
   }
 
-  static inline auto make_adaptor(rmm::mr::device_memory_resource* mr,
-                                  std::optional<std::size_t> allocation_limit,
+  static inline auto make_adaptor(std::shared_ptr<rmm::mr::device_memory_resource> upstream,
+                                  std::size_t limit,
                                   std::optional<std::size_t> alignment)
     -> rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>
   {
-    if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-    auto limit = allocation_limit.value_or(get_device_mem());
+    auto p = upstream.get();
     if (alignment.has_value()) {
-      return rmm::mr::limiting_resource_adaptor(mr, limit, alignment.value());
+      return rmm::mr::limiting_resource_adaptor(p, limit, alignment.value());
     } else {
-      return rmm::mr::limiting_resource_adaptor(mr, limit);
+      return rmm::mr::limiting_resource_adaptor(p, limit);
     }
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 4b6e6f5e31..7323239e67 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -1262,10 +1262,10 @@ void ivfpq_search_worker(raft::device_resources const& handle,
                          IdxT* neighbors,                    // [n_queries, topK]
                          float* distances,                   // [n_queries, topK]
                          float scaling_factor,
-                         double preferred_shmem_carveout,
-                         rmm::mr::device_memory_resource* mr)
+                         double preferred_shmem_carveout)
 {
   auto stream = handle.get_stream();
+  auto mr     = handle.get_workspace_resource();
 
   bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries);
   auto topk_len          = manage_local_topk ? n_probes * topK : max_samples;
@@ -1554,8 +1554,7 @@ inline void search(raft::device_resources const& handle,
                    uint32_t n_queries,
                    uint32_t k,
                    IdxT* neighbors,
-                   float* distances,
-                   rmm::mr::device_memory_resource* mr = nullptr)
+                   float* distances)
 {
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "Unsupported element type.");
@@ -1601,11 +1600,7 @@ inline void search(raft::device_resources const& handle,
     max_samples = ms;
   }
 
-  auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16);
-  if (pool_guard) {
-    RAFT_LOG_DEBUG("ivf_pq::search: using pool memory resource with initial size %zu bytes",
-                   pool_guard->pool_size());
-  }
+  auto mr = handle.get_workspace_resource();
 
   // Maximum number of query vectors to search at the same time.
   const auto max_queries = std::min<uint32_t>(std::max<uint32_t>(n_queries, 1), 4096);
@@ -1669,8 +1664,7 @@ inline void search(raft::device_resources const& handle,
                       neighbors + uint64_t(k) * (offset_q + offset_b),
                       distances + uint64_t(k) * (offset_q + offset_b),
                       utils::config<T>::kDivisor / utils::config<float>::kDivisor,
-                      params.preferred_shmem_carveout,
-                      mr);
+                      params.preferred_shmem_carveout);
     }
   }
 }
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 4a12ca72a4..2a5c322016 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -182,8 +182,7 @@ void search(raft::device_resources const& handle,
                         static_cast<std::uint32_t>(queries.extent(0)),
                         k,
                         neighbors.data_handle(),
-                        distances.data_handle(),
-                        handle.get_workspace_resource());
+                        distances.data_handle());
 }
 
 /** @} */  // end group ivf_pq
@@ -349,7 +348,8 @@ void search(raft::device_resources const& handle,
             float* distances,
             rmm::mr::device_memory_resource* mr = nullptr)
 {
-  return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, mr);
+  if (mr != nullptr) { raft::resource::set_workspace_resource(handle, mr); }
+  return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);
 }
 
 }  // namespace raft::neighbors::ivf_pq

From c33d519ad1271e32a55c599a3b19176159727f82 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Mar 2023 08:16:47 +0100
Subject: [PATCH 03/25] Refactor the resource to more rely on shared_ptr to
 manage lifetime

---
 cpp/include/raft/core/device_resources.hpp    |   8 +-
 .../core/resource/device_memory_resource.hpp  | 118 +++++++++---------
 2 files changed, 61 insertions(+), 65 deletions(-)

diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index 0275f08152..6f82fc4f6c 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -66,8 +66,7 @@ class device_resources : public resources {
     : resources{handle}
   {
     // replace the resource factory for the workspace_resources
-    resources::add_resource_factory(std::make_shared<resource::workspace_resource_factory>(
-      workspace_resource, allocation_limit, std::nullopt));
+    resource::set_workspace_resource(*this, workspace_resource, allocation_limit);
   }
 
   device_resources(const device_resources& handle) : resources{handle} {}
@@ -82,6 +81,8 @@ class device_resources : public resources {
    * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified)
    * @param[in] workspace_resource an optional resource used by some functions for allocating
    *            temporary workspaces.
+   *            NB: the function takes the ownership of the resource; the semantics is the same
+   *            as if the pointer was passed to std::shared_ptr constructor.
    */
   device_resources(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
                    std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
@@ -94,8 +95,7 @@ class device_resources : public resources {
       std::make_shared<resource::cuda_stream_resource_factory>(stream_view));
     resources::add_resource_factory(
       std::make_shared<resource::cuda_stream_pool_resource_factory>(stream_pool));
-    resources::add_resource_factory(std::make_shared<resource::workspace_resource_factory>(
-      workspace_resource, allocation_limit, std::nullopt));
+    resource::set_workspace_resource(*this, workspace_resource, allocation_limit);
   }
 
   /** Destroys all held-up resources */
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index ce6dc0623b..a0685ebc07 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -29,19 +29,10 @@
 namespace raft::resource {
 class limited_memory_resource : public resource {
  public:
-  limited_memory_resource(rmm::mr::device_memory_resource* mr,
-                          std::optional<std::size_t> allocation_limit,
-                          std::optional<std::size_t> alignment)
-    : limited_memory_resource(mr, get_alloc_limit(allocation_limit), alignment)
-  {
-  }
-
-  template <class Deleter>
-  limited_memory_resource(rmm::mr::device_memory_resource* mr,
-                          Deleter d,
-                          std::optional<std::size_t> allocation_limit,
+  limited_memory_resource(std::shared_ptr<rmm::mr::device_memory_resource> mr,
+                          std::size_t allocation_limit,
                           std::optional<std::size_t> alignment)
-    : limited_memory_resource(mr, d, get_alloc_limit(allocation_limit), alignment)
+    : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment))
   {
   }
 
@@ -53,46 +44,6 @@ class limited_memory_resource : public resource {
   std::shared_ptr<rmm::mr::device_memory_resource> upstream_;
   rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> mr_;
 
-  limited_memory_resource(rmm::mr::device_memory_resource* mr,
-                          std::size_t allocation_limit,
-                          std::optional<std::size_t> alignment)
-    : upstream_{get_upstream(mr, allocation_limit)},
-      mr_(make_adaptor(upstream_, allocation_limit, alignment))
-  {
-  }
-
-  template <class Deleter>
-  limited_memory_resource(rmm::mr::device_memory_resource* mr,
-                          Deleter d,
-                          std::size_t allocation_limit,
-                          std::optional<std::size_t> alignment)
-    : upstream_{get_upstream(mr, allocation_limit), d},
-      mr_{make_adaptor(upstream_, allocation_limit, alignment)}
-  {
-  }
-
-  static inline auto get_upstream(rmm::mr::device_memory_resource* mr, std::size_t allocation_limit)
-    -> rmm::mr::device_memory_resource*
-  {
-    if (mr != nullptr) { return mr; }
-    // Create a pool memory resource by default
-    constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu;
-    auto min_size                = std::min<std::size_t>(kOneGb, allocation_limit / 2);
-    auto max_size                = allocation_limit * 3lu / 2lu;
-    return new rmm::mr::pool_memory_resource(
-      rmm::mr::get_current_device_resource(), min_size, max_size);
-  }
-
-  static inline auto get_alloc_limit(std::optional<std::size_t> limit) -> std::size_t
-  {
-    if (limit.has_value()) { return limit.value(); }
-    // Allow a fraction of available memory by default.
-    std::size_t free_size{};
-    std::size_t total_size{};
-    RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
-    return free_size / 2;
-  }
-
   static inline auto make_adaptor(std::shared_ptr<rmm::mr::device_memory_resource> upstream,
                                   std::size_t limit,
                                   std::optional<std::size_t> alignment)
@@ -113,12 +64,16 @@ class limited_memory_resource : public resource {
  */
 class workspace_resource_factory : public resource_factory {
  public:
-  workspace_resource_factory(rmm::mr::device_memory_resource* mr,
-                             std::optional<std::size_t> allocation_limit,
-                             std::optional<std::size_t> alignment)
-    : mr_(mr), allocation_limit_(allocation_limit), alignment_(alignment)
+  explicit workspace_resource_factory(
+    std::shared_ptr<rmm::mr::device_memory_resource> mr = {nullptr},
+    std::optional<std::size_t> allocation_limit         = std::nullopt,
+    std::optional<std::size_t> alignment                = std::nullopt)
+    : allocation_limit_(allocation_limit.value_or(default_allocation_limit())),
+      alignment_(alignment),
+      mr_(mr ? mr : default_memory_resource(allocation_limit_))
   {
   }
+
   auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; }
   auto make_resource() -> resource* override
   {
@@ -126,9 +81,30 @@ class workspace_resource_factory : public resource_factory {
   }
 
  private:
-  rmm::mr::device_memory_resource* mr_;
-  std::optional<std::size_t> allocation_limit_;
+  std::size_t allocation_limit_;
   std::optional<std::size_t> alignment_;
+  std::shared_ptr<rmm::mr::device_memory_resource> mr_;
+
+  // Create a pool memory resource by default
+  static inline auto default_memory_resource(std::size_t limit)
+    -> std::shared_ptr<rmm::mr::device_memory_resource>
+  {
+    constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu;
+    auto min_size                = std::min<std::size_t>(kOneGb, limit / 2);
+    auto max_size                = limit * 3lu / 2lu;
+    auto upstream                = rmm::mr::get_current_device_resource();
+    return std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(
+      upstream, min_size, max_size);
+  }
+
+  // Allow a fraction of available memory by default.
+  static inline auto default_allocation_limit() -> std::size_t
+  {
+    std::size_t free_size{};
+    std::size_t total_size{};
+    RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
+    return free_size / 2;
+  }
 };
 
 /**
@@ -141,8 +117,7 @@ inline auto get_workspace_resource(resources const& res)
   -> rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*
 {
   if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) {
-    res.add_resource_factory(
-      std::make_shared<workspace_resource_factory>(nullptr, std::nullopt, std::nullopt));
+    res.add_resource_factory(std::make_shared<workspace_resource_factory>());
   }
   return res.get_resource<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>>(
     resource_type::WORKSPACE_RESOURCE);
@@ -155,11 +130,32 @@ inline auto get_workspace_resource(resources const& res)
  * @param mr a valid rmm device_memory_resource
  */
 inline void set_workspace_resource(resources const& res,
-                                   rmm::mr::device_memory_resource* mr         = nullptr,
+                                   std::shared_ptr<rmm::mr::device_memory_resource> mr = {nullptr},
                                    std::optional<std::size_t> allocation_limit = std::nullopt,
                                    std::optional<std::size_t> alignment        = std::nullopt)
 {
   res.add_resource_factory(
     std::make_shared<workspace_resource_factory>(mr, allocation_limit, alignment));
 };
+
+inline void set_workspace_resource(resources const& res,
+                                   rmm::mr::device_memory_resource* mr,
+                                   std::optional<std::size_t> allocation_limit = std::nullopt,
+                                   std::optional<std::size_t> alignment        = std::nullopt)
+{
+  set_workspace_resource(
+    res, std::shared_ptr<rmm::mr::device_memory_resource>{mr}, allocation_limit, alignment);
+};
+
+template <class Deleter>
+inline void set_workspace_resource(resources const& res,
+                                   rmm::mr::device_memory_resource* mr,
+                                   Deleter d,
+                                   std::optional<std::size_t> allocation_limit = std::nullopt,
+                                   std::optional<std::size_t> alignment        = std::nullopt)
+{
+  set_workspace_resource(
+    res, std::shared_ptr<rmm::mr::device_memory_resource>{mr, d}, allocation_limit, alignment);
+};
+
 }  // namespace raft::resource

From de5dd848ae6603f1c8f1e330f3b1dc178d4a4fa9 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 21 Mar 2023 09:43:35 +0100
Subject: [PATCH 04/25] Preserve the semantics of not transfering the ownership
 of raw pointers to keep the change non-breaking

---
 cpp/include/raft/core/device_resources.hpp    |  4 +-
 .../core/resource/device_memory_resource.hpp  | 42 ++++++++++++-------
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 10 +----
 cpp/include/raft/neighbors/ivf_pq.cuh         |  5 ++-
 4 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index 6f82fc4f6c..47370f5371 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -81,8 +81,8 @@ class device_resources : public resources {
    * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified)
    * @param[in] workspace_resource an optional resource used by some functions for allocating
    *            temporary workspaces.
-   *            NB: the function takes the ownership of the resource; the semantics is the same
-   *            as if the pointer was passed to std::shared_ptr constructor.
+   * @param[in] allocation_limit the total amount of memory in bytes available to the temporary
+   *            workspace resources.
    */
   device_resources(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
                    std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index a0685ebc07..d1c5b1726a 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/core/resource/resource_types.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -110,6 +111,7 @@ class workspace_resource_factory : public resource_factory {
 /**
  * Load a temp workspace resource from a resources instance (and populate it on the res
  * if needed).
+ *
  * @param res raft resources object for managing resources
  * @return device memory resource object
  */
@@ -124,10 +126,14 @@ inline auto get_workspace_resource(resources const& res)
 };
 
 /**
- * Set a temp workspace resource on a resources instance.
+ * Set a temporary workspace resource on a resources instance.
  *
  * @param res raft resources object for managing resources
- * @param mr a valid rmm device_memory_resource
+ * @param mr an optional RMM device_memory_resource
+ * @param allocation_limit
+ *   the total amount of memory in bytes available to the temporary workspace resources.
+ * @param alignment optional alignment requirements passed to RMM allocations
+ *
  */
 inline void set_workspace_resource(resources const& res,
                                    std::shared_ptr<rmm::mr::device_memory_resource> mr = {nullptr},
@@ -138,24 +144,30 @@ inline void set_workspace_resource(resources const& res,
     std::make_shared<workspace_resource_factory>(mr, allocation_limit, alignment));
 };
 
+/**
+ * Set a temporary workspace resource on a resources instance.
+ *
+ * @param res raft resources object for managing resources
+ * @param mr an optional RMM device_memory_resource;
+ *   note, the ownership of the object is not transferred with this raw pointer interface.
+ * @param allocation_limit
+ *   the total amount of memory in bytes available to the temporary workspace resources.
+ * @param alignment optional alignment requirements passed to RMM allocations
+ *
+ */
 inline void set_workspace_resource(resources const& res,
                                    rmm::mr::device_memory_resource* mr,
                                    std::optional<std::size_t> allocation_limit = std::nullopt,
                                    std::optional<std::size_t> alignment        = std::nullopt)
 {
-  set_workspace_resource(
-    res, std::shared_ptr<rmm::mr::device_memory_resource>{mr}, allocation_limit, alignment);
-};
-
-template <class Deleter>
-inline void set_workspace_resource(resources const& res,
-                                   rmm::mr::device_memory_resource* mr,
-                                   Deleter d,
-                                   std::optional<std::size_t> allocation_limit = std::nullopt,
-                                   std::optional<std::size_t> alignment        = std::nullopt)
-{
-  set_workspace_resource(
-    res, std::shared_ptr<rmm::mr::device_memory_resource>{mr, d}, allocation_limit, alignment);
+  // NB: to preserve the semantics of passing memory resource without transferring the ownership,
+  //     we create a shared pointer with a dummy deleter (void_op).
+  set_workspace_resource(res,
+                         mr != nullptr
+                           ? std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}}
+                           : std::shared_ptr<rmm::mr::device_memory_resource>{nullptr},
+                         allocation_limit,
+                         alignment);
 };
 
 }  // namespace raft::resource
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 1a563d213e..d424d59dd1 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -442,9 +442,6 @@ void train_per_subset(raft::device_resources const& handle,
                  index.pq_len(),
                  stream);
 
-    // clone the handle and attached the device memory resource to it
-    const device_resources new_handle(handle, device_memory);
-
     // train PQ codebook for this subspace
     auto sub_trainset_view =
       raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
@@ -458,7 +455,7 @@ void train_per_subset(raft::device_resources const& handle,
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
-    raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle,
+    raft::cluster::kmeans_balanced::helpers::build_clusters(handle,
                                                             kmeans_params,
                                                             sub_trainset_view,
                                                             centers_tmp_view,
@@ -523,9 +520,6 @@ void train_per_cluster(raft::device_resources const& handle,
                      indices + cluster_offsets[l],
                      device_memory);
 
-    // clone the handle and attached the device memory resource to it
-    const device_resources new_handle(handle, device_memory);
-
     // limit the cluster size to bound the training time.
     // [sic] we interpret the data as pq_len-dimensional
     size_t big_enough     = 256ul * std::max<size_t>(index.pq_book_size(), index.pq_dim());
@@ -546,7 +540,7 @@ void train_per_cluster(raft::device_resources const& handle,
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
-    raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle,
+    raft::cluster::kmeans_balanced::helpers::build_clusters(handle,
                                                             kmeans_params,
                                                             rot_vectors_view,
                                                             centers_tmp_view,
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 2a5c322016..6e51f65cf3 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -23,10 +23,13 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
+#include <raft/core/operators.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
+#include <memory>
+
 namespace raft::neighbors::ivf_pq {
 
 /**
@@ -348,7 +351,7 @@ void search(raft::device_resources const& handle,
             float* distances,
             rmm::mr::device_memory_resource* mr = nullptr)
 {
-  if (mr != nullptr) { raft::resource::set_workspace_resource(handle, mr); }
+  if (mr != nullptr) { resource::set_workspace_resource(handle, mr); }
   return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);
 }
 

From 79c954e228030ff2aa58ce3b9c4be3c8778b931c Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 May 2023 08:29:16 +0200
Subject: [PATCH 05/25] Fix a missing merge change

---
 cpp/include/raft/neighbors/ivf_pq-inl.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index aecce5e289..8e10e99088 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -182,8 +182,7 @@ void search(raft::device_resources const& handle,
                         static_cast<std::uint32_t>(queries.extent(0)),
                         k,
                         neighbors.data_handle(),
-                        distances.data_handle(),
-                        handle.get_workspace_resource());
+                        distances.data_handle());
 }
 
 /** @} */  // end group ivf_pq

From 6cf11037b4c4f64f9057cd7cc6357907b8631b33 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 May 2023 09:02:35 +0200
Subject: [PATCH 06/25] Make the resource change not permanent

---
 cpp/include/raft/neighbors/ivf_pq-inl.cuh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index 8e10e99088..be8f7edaf1 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -348,8 +348,12 @@ void search(raft::device_resources const& handle,
             float* distances,
             rmm::mr::device_memory_resource* mr = nullptr)
 {
-  if (mr != nullptr) { resource::set_workspace_resource(handle, mr); }
-  return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);
+  if (mr != nullptr) {
+    const device_resources res_local(handle, mr);
+    return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances);
+  } else {
+    return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);
+  }
 }
 
 }  // namespace raft::neighbors::ivf_pq

From 370b9edd1036ebc95509d644ac335f1711100579 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 May 2023 11:09:18 +0200
Subject: [PATCH 07/25] Don't force use the temp local workspace for all raft
 allocations

---
 .../raft/core/device_container_policy.hpp     | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp
index eef981e56f..1fb2080fdb 100644
--- a/cpp/include/raft/core/device_container_policy.hpp
+++ b/cpp/include/raft/core/device_container_policy.hpp
@@ -164,7 +164,7 @@ class device_uvector_policy {
  public:
   auto create(raft::resources const& res, size_t n) -> container_type
   {
-    return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res));
+    return container_type(n, resource::get_cuda_stream(res));
   }
 
   device_uvector_policy() = default;
@@ -183,4 +183,43 @@ class device_uvector_policy {
   [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
 };
 
+/**
+ * @brief A container policy for device mdarray allocated in the temporary workspace.
+ */
+template <typename ElementType>
+class device_uvector_workspace_policy {
+ public:
+  using element_type   = ElementType;
+  using container_type = device_uvector<element_type>;
+  // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector
+  using pointer         = typename container_type::pointer;
+  using const_pointer   = typename container_type::const_pointer;
+  using reference       = device_reference<element_type>;
+  using const_reference = device_reference<element_type const>;
+
+  using accessor_policy       = std::experimental::default_accessor<element_type>;
+  using const_accessor_policy = std::experimental::default_accessor<element_type const>;
+
+ public:
+  auto create(raft::resources const& res, size_t n) -> container_type
+  {
+    return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res));
+  }
+
+  device_uvector_workspace_policy() = default;
+
+  [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
+  {
+    return c[n];
+  }
+  [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept
+    -> const_reference
+  {
+    return c[n];
+  }
+
+  [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; }
+  [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
+};
+
 }  // namespace raft

From 06cf4ff6aa655548f4f3b82fa99bbc7dd042c5c7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 13:06:07 +0200
Subject: [PATCH 08/25] Don't use device_resources

---
 cpp/include/raft/neighbors/ivf_pq-inl.cuh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index 7fc0944e7b..8be13f675a 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -16,18 +16,18 @@
 
 #pragma once
 
-#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/neighbors/detail/ivf_pq_build.cuh>
 #include <raft/neighbors/detail/ivf_pq_search.cuh>
 #include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/device_resources.hpp>  // TODO: remove when possible
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <memory>  // shared_ptr
 
 namespace raft::neighbors::ivf_pq {
 
@@ -459,8 +459,11 @@ void search(raft::resources const& handle,
             rmm::mr::device_memory_resource* mr = nullptr)
 {
   if (mr != nullptr) {
-    // TODO: replace in with a method that would allow to clone raft::resources
-    const device_resources res_local(handle, mr);
+    // Shallow copy of the resource with the automatic lifespan:
+    //                               change the workspace resource temporarily
+    raft::resources res_local(handle);
+    resource::set_workspace_resource(
+      res_local, std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}});
     return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances);
   } else {
     return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);

From f27ba869fe581d9b0bb1a06007baf2bd6545c43d Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:23:30 +0200
Subject: [PATCH 09/25] Using more of workspace memory resource

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 44 ++++++-------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 9ada69e0f1..a006e2f1d2 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -346,10 +346,10 @@ void train_per_subset(raft::resources const& handle,
                       const float* trainset,   // [n_rows, dim]
                       const uint32_t* labels,  // [n_rows]
                       uint32_t kmeans_n_iters,
-                      rmm::mr::device_memory_resource* managed_memory,
-                      rmm::mr::device_memory_resource* device_memory)
+                      rmm::mr::device_memory_resource* managed_memory)
 {
-  auto stream = resource::get_cuda_stream(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+  auto device_memory = resource::get_workspace_resource(handle);
 
   rmm::device_uvector<float> pq_centers_tmp(index.pq_centers().size(), stream, device_memory);
   rmm::device_uvector<float> sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory);
@@ -392,11 +392,6 @@ void train_per_subset(raft::resources const& handle,
                  index.pq_len(),
                  stream);
 
-    // clone the handle and attached the device memory resource to it
-    // TODO(achirkin): check if we should remove the device memory in the arguments
-    const resources new_handle(handle);
-    resource::set_workspace_resource(new_handle, device_memory);
-
     // train PQ codebook for this subspace
     auto sub_trainset_view =
       raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
@@ -428,10 +423,10 @@ void train_per_cluster(raft::resources const& handle,
                        const float* trainset,   // [n_rows, dim]
                        const uint32_t* labels,  // [n_rows]
                        uint32_t kmeans_n_iters,
-                       rmm::mr::device_memory_resource* managed_memory,
-                       rmm::mr::device_memory_resource* device_memory)
+                       rmm::mr::device_memory_resource* managed_memory)
 {
-  auto stream = resource::get_cuda_stream(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+  auto device_memory = resource::get_workspace_resource(handle);
 
   rmm::device_uvector<float> pq_centers_tmp(index.pq_centers().size(), stream, device_memory);
   rmm::device_uvector<uint32_t> cluster_sizes(index.n_lists(), stream, managed_memory);
@@ -475,11 +470,6 @@ void train_per_cluster(raft::resources const& handle,
                      indices + cluster_offsets[l],
                      device_memory);
 
-    // clone the handle and attached the device memory resource to it
-    // TODO(achirkin): check if we should remove the device memory in the arguments
-    const resources new_handle(handle);
-    resource::set_workspace_resource(new_handle, device_memory);
-
     // limit the cluster size to bound the training time.
     // [sic] we interpret the data as pq_len-dimensional
     size_t big_enough     = 256ul * std::max<size_t>(index.pq_book_size(), index.pq_dim());
@@ -1545,24 +1535,18 @@ auto build(raft::resources const& handle,
       size_t(n_rows) / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
     size_t n_rows_train = n_rows / trainset_ratio;
 
-    rmm::mr::device_memory_resource* device_memory = nullptr;
-    auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024);
-    if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq::build: using pool memory resource"); }
-
+    auto* device_memory = resource::get_workspace_resource(handle);
     rmm::mr::managed_memory_resource managed_memory_upstream;
     rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource> managed_memory(
       &managed_memory_upstream, 1024 * 1024);
 
     // If the trainset is small enough to comfortably fit into device memory, put it there.
     // Otherwise, use the managed memory.
+    constexpr size_t kTolerableRatio                     = 4;
     rmm::mr::device_memory_resource* big_memory_resource = &managed_memory;
-    {
-      size_t free_mem, total_mem;
-      constexpr size_t kTolerableRatio = 4;
-      RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem));
-      if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) {
-        big_memory_resource = device_memory;
-      }
+    if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio <
+        resource::get_workspace_free_bytes(handle)) {
+      big_memory_resource = device_memory;
     }
 
     // Besides just sampling, we transform the input dataset into floats to make it easier
@@ -1711,8 +1695,7 @@ auto build(raft::resources const& handle,
                          trainset.data(),
                          labels.data(),
                          params.kmeans_n_iters,
-                         &managed_memory,
-                         device_memory);
+                         &managed_memory);
         break;
       case codebook_gen::PER_CLUSTER:
         train_per_cluster(handle,
@@ -1721,8 +1704,7 @@ auto build(raft::resources const& handle,
                           trainset.data(),
                           labels.data(),
                           params.kmeans_n_iters,
-                          &managed_memory,
-                          device_memory);
+                          &managed_memory);
         break;
       default: RAFT_FAIL("Unreachable code");
     }

From d435855dd76930db05d1270979ba8dd714d12d7a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:24:35 +0200
Subject: [PATCH 10/25] Let device_uvector_policy keep the memory resource when
 needed

---
 .../raft/core/device_container_policy.hpp     | 51 +++++--------------
 cpp/include/raft/core/device_mdarray.hpp      |  2 +-
 .../neighbors/detail/ivf_flat_serialize.cuh   |  2 +-
 .../neighbors/detail/ivf_pq_serialize.cuh     |  2 +-
 4 files changed, 15 insertions(+), 42 deletions(-)

diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp
index 1fb2080fdb..011de307db 100644
--- a/cpp/include/raft/core/device_container_policy.hpp
+++ b/cpp/include/raft/core/device_container_policy.hpp
@@ -164,49 +164,19 @@ class device_uvector_policy {
  public:
   auto create(raft::resources const& res, size_t n) -> container_type
   {
-    return container_type(n, resource::get_cuda_stream(res));
+    if (mr_ == nullptr) {
+      // NB: not using the workspace resource by default!
+      //     The workspace resource is for short-lived temporary allocations.
+      return container_type(n, resource::get_cuda_stream(res));
+    } else {
+      return container_type(n, resource::get_cuda_stream(res), mr_);
+    }
   }
 
-  device_uvector_policy() = default;
-
-  [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
+  constexpr device_uvector_policy() = default;
+  constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr)
   {
-    return c[n];
   }
-  [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept
-    -> const_reference
-  {
-    return c[n];
-  }
-
-  [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; }
-  [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
-};
-
-/**
- * @brief A container policy for device mdarray allocated in the temporary workspace.
- */
-template <typename ElementType>
-class device_uvector_workspace_policy {
- public:
-  using element_type   = ElementType;
-  using container_type = device_uvector<element_type>;
-  // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector
-  using pointer         = typename container_type::pointer;
-  using const_pointer   = typename container_type::const_pointer;
-  using reference       = device_reference<element_type>;
-  using const_reference = device_reference<element_type const>;
-
-  using accessor_policy       = std::experimental::default_accessor<element_type>;
-  using const_accessor_policy = std::experimental::default_accessor<element_type const>;
-
- public:
-  auto create(raft::resources const& res, size_t n) -> container_type
-  {
-    return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res));
-  }
-
-  device_uvector_workspace_policy() = default;
 
   [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
   {
@@ -220,6 +190,9 @@ class device_uvector_workspace_policy {
 
   [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; }
   [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
+
+ private:
+  rmm::mr::device_memory_resource* mr_{nullptr};
 };
 
 }  // namespace raft
diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp
index 68273db15c..fe543c97dd 100644
--- a/cpp/include/raft/core/device_mdarray.hpp
+++ b/cpp/include/raft/core/device_mdarray.hpp
@@ -112,7 +112,7 @@ auto make_device_mdarray(raft::resources const& handle,
   using mdarray_t = device_mdarray<ElementType, decltype(exts), LayoutPolicy>;
 
   typename mdarray_t::mapping_type layout{exts};
-  typename mdarray_t::container_policy_type policy{};
+  typename mdarray_t::container_policy_type policy{mr};
 
   return mdarray_t{handle, layout, policy};
 }
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
index b00d308827..61a6046273 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
@@ -45,7 +45,7 @@ struct check_index_layout {
                 "paste in the new size and consider updating the serialization logic");
 };
 
-template struct check_index_layout<sizeof(index<double, std::uint64_t>), 296>;
+template struct check_index_layout<sizeof(index<double, std::uint64_t>), 328>;
 
 /**
  * Save the index to file.
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh
index ff5bd8ef89..f01035cad3 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh
@@ -48,7 +48,7 @@ struct check_index_layout {
 };
 
 // TODO: Recompute this and come back to it.
-template struct check_index_layout<sizeof(index<std::uint64_t>), 448>;
+template struct check_index_layout<sizeof(index<std::uint64_t>), 480>;
 
 /**
  * Write the index to an output stream

From 1b62e3af8a18ea8fefff2cb64870c144415f1273 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:24:57 +0200
Subject: [PATCH 11/25] Make helper to query workspace size

---
 .../core/resource/device_memory_resource.hpp  | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 07b3dd0be0..8cd93fe46b 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -126,6 +126,25 @@ inline auto get_workspace_resource(resources const& res)
     resource_type::WORKSPACE_RESOURCE);
 };
 
+/** Get the total size of the workspace resource. */
+inline auto get_workspace_total_bytes(resources const& res) -> size_t
+{
+  return get_workspace_resource(res)->get_allocation_limit();
+};
+
+/** Get the already allocated size of the workspace resource. */
+inline auto get_workspace_used_bytes(resources const& res) -> size_t
+{
+  return get_workspace_resource(res)->get_allocated_bytes();
+};
+
+/** Get the available size of the workspace resource. */
+inline auto get_workspace_free_bytes(resources const& res) -> size_t
+{
+  const auto* p = get_workspace_resource(res);
+  return p->get_allocation_limit() - p->get_allocated_bytes();
+};
+
 /**
  * Set a temporary workspace resource on a resources instance.
  *

From 5fed631645d8d512f52591cdcb11fc4056fed935 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 15:06:49 +0200
Subject: [PATCH 12/25] Tiny unrelated test fix: copy data in a stream.

---
 cpp/test/util/device_atomics.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu
index 5e8a67c8f6..f305c7c875 100644
--- a/cpp/test/util/device_atomics.cu
+++ b/cpp/test/util/device_atomics.cu
@@ -53,14 +53,11 @@ TEST(Raft, AtomicIncWarp)
                                                                        out_device.data());
 
   // Copy data to host
-  RAFT_CUDA_TRY(cudaMemcpy(out_host.data(),
-                           (const void*)out_device.data(),
-                           num_elts * sizeof(int),
-                           cudaMemcpyDeviceToHost));
+  copy(out_host.data(), out_device.data(), num_elts, s);
 
   // Check that count is correct and that each thread index is contained in the
   // array exactly once.
-  ASSERT_EQ(num_elts, counter.value(s));
+  ASSERT_EQ(num_elts, counter.value(s));  // NB: accessing the counter synchronizes `s`
   std::sort(out_host.begin(), out_host.end());
   for (int i = 0; i < num_elts; ++i) {
     ASSERT_EQ(i, out_host[i]);

From a2e749dff03223d6482e2f24367c4276b955f822 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 16:45:44 +0200
Subject: [PATCH 13/25] Update the API to always use shared pointers to the
 resources

---
 cpp/include/raft/core/device_resources.hpp    | 10 +--
 cpp/include/raft/core/handle.hpp              |  9 +--
 .../core/resource/device_memory_resource.hpp  | 26 -------
 cpp/test/core/handle.cpp                      | 71 ++++++++++++-------
 4 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index 6c7813890a..cf06920a8c 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -61,7 +61,7 @@ namespace raft {
 class device_resources : public resources {
  public:
   device_resources(const device_resources& handle,
-                   rmm::mr::device_memory_resource* workspace_resource,
+                   std::shared_ptr<rmm::mr::device_memory_resource> workspace_resource,
                    std::optional<std::size_t> allocation_limit = std::nullopt)
     : resources{handle}
   {
@@ -86,8 +86,8 @@ class device_resources : public resources {
    */
   device_resources(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
                    std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
-                   rmm::mr::device_memory_resource* workspace_resource = nullptr,
-                   std::optional<std::size_t> allocation_limit         = std::nullopt)
+                   std::shared_ptr<rmm::mr::device_memory_resource> workspace_resource = {nullptr},
+                   std::optional<std::size_t> allocation_limit = std::nullopt)
     : resources{}
   {
     resources::add_resource_factory(std::make_shared<resource::device_id_resource_factory>());
@@ -95,7 +95,9 @@ class device_resources : public resources {
       std::make_shared<resource::cuda_stream_resource_factory>(stream_view));
     resources::add_resource_factory(
       std::make_shared<resource::cuda_stream_pool_resource_factory>(stream_pool));
-    resource::set_workspace_resource(*this, workspace_resource, allocation_limit);
+    if (workspace_resource) {
+      resource::set_workspace_resource(*this, workspace_resource, allocation_limit);
+    }
   }
 
   /** Destroys all held-up resources */
diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp
index 2a6b5657e2..124ab8c315 100644
--- a/cpp/include/raft/core/handle.hpp
+++ b/cpp/include/raft/core/handle.hpp
@@ -32,7 +32,8 @@ namespace raft {
  */
 class handle_t : public raft::device_resources {
  public:
-  handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource)
+  handle_t(const handle_t& handle,
+           std::shared_ptr<rmm::mr::device_memory_resource> workspace_resource)
     : device_resources(handle, workspace_resource)
   {
   }
@@ -51,9 +52,9 @@ class handle_t : public raft::device_resources {
    * @param[in] workspace_resource an optional resource used by some functions for allocating
    *            temporary workspaces.
    */
-  handle_t(rmm::cuda_stream_view stream_view                   = rmm::cuda_stream_per_thread,
-           std::shared_ptr<rmm::cuda_stream_pool> stream_pool  = {nullptr},
-           rmm::mr::device_memory_resource* workspace_resource = nullptr)
+  handle_t(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
+           std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
+           std::shared_ptr<rmm::mr::device_memory_resource> workspace_resource = {nullptr})
     : device_resources{stream_view, stream_pool, workspace_resource}
   {
   }
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 8cd93fe46b..33e8944dcc 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -164,30 +164,4 @@ inline void set_workspace_resource(resources const& res,
     std::make_shared<workspace_resource_factory>(mr, allocation_limit, alignment));
 };
 
-/**
- * Set a temporary workspace resource on a resources instance.
- *
- * @param res raft resources object for managing resources
- * @param mr an optional RMM device_memory_resource;
- *   note, the ownership of the object is not transferred with this raw pointer interface.
- * @param allocation_limit
- *   the total amount of memory in bytes available to the temporary workspace resources.
- * @param alignment optional alignment requirements passed to RMM allocations
- *
- */
-inline void set_workspace_resource(resources const& res,
-                                   rmm::mr::device_memory_resource* mr,
-                                   std::optional<std::size_t> allocation_limit = std::nullopt,
-                                   std::optional<std::size_t> alignment        = std::nullopt)
-{
-  // NB: to preserve the semantics of passing memory resource without transferring the ownership,
-  //     we create a shared pointer with a dummy deleter (void_op).
-  set_workspace_resource(res,
-                         mr != nullptr
-                           ? std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}}
-                           : std::shared_ptr<rmm::mr::device_memory_resource>{nullptr},
-                         allocation_limit,
-                         alignment);
-};
-
 }  // namespace raft::resource
diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp
index 5436396b86..c0efe9236f 100644
--- a/cpp/test/core/handle.cpp
+++ b/cpp/test/core/handle.cpp
@@ -25,6 +25,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cuda_stream_pool.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <unordered_map>
@@ -274,39 +275,61 @@ TEST(Raft, WorkspaceResource)
 {
   raft::handle_t handle;
 
-  ASSERT_TRUE(dynamic_cast<const rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>*>(
-                resource::get_workspace_resource(handle)) == nullptr);
-  ASSERT_EQ(rmm::mr::get_current_device_resource(), resource::get_workspace_resource(handle));
+  // The returned resource is always a limiting adaptor
+  auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream();
 
-  auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource());
-  std::shared_ptr<rmm::cuda_stream_pool> pool = {nullptr};
-  raft::handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr);
+  // Let's create a pooled resource
+  auto pool_mr = std::shared_ptr<rmm::mr::device_memory_resource>{
+    new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())};
 
-  ASSERT_TRUE(dynamic_cast<const rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>*>(
-                resource::get_workspace_resource(handle2)) != nullptr);
-  // ASSERT_EQ(pool_mr, resource::get_workspace_resource(handle2)); // TODO: limiting resource!
+  // A tiny workspace of 1MB
+  size_t max_size = 1024 * 1024;
 
-  delete pool_mr;
-}
-
-TEST(Raft, WorkspaceResourceCopy)
-{
-  auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(10);
+  // Replace the resource
+  resource::set_workspace_resource(handle, pool_mr, max_size);
+  auto new_mr = resource::get_workspace_resource(handle);
 
-  handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
+  // By this point, the orig_mr likely points to a non-existent resource; don't dereference!
+  ASSERT_NE(orig_mr, new_mr);
+  ASSERT_EQ(pool_mr.get(), new_mr->get_upstream());
+  // We can safely reset pool_mr, because the shared_ptr to the pool memory stays in the resource
+  pool_mr.reset();
 
-  auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource());
+  auto stream = resource::get_cuda_stream(handle);
+  rmm::device_buffer buf(max_size / 2, stream, new_mr);
 
-  handle_t copied_handle(handle, pool_mr);
+  // Note, the underlying pool allocator likey uses more space than reported here
+  ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle));
+  ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle));
+  ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle));
 
-  assert_handles_equal(handle, copied_handle);
+  // this should throw, becaise we partially used the space.
+  ASSERT_THROW((rmm::device_buffer{max_size, stream, new_mr}), rmm::bad_alloc);
+}
 
-  // Assert the workspace_resources are what we expect
-  ASSERT_TRUE(dynamic_cast<const rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>*>(
-                resource::get_workspace_resource(handle)) == nullptr);
+TEST(Raft, WorkspaceResourceCopy)
+{
+  raft::handle_t res;
+  auto orig_mr   = resource::get_workspace_resource(res);
+  auto orig_size = resource::get_workspace_total_bytes(res);
 
-  ASSERT_TRUE(dynamic_cast<const rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>*>(
-                resource::get_workspace_resource(copied_handle)) != nullptr);
+  {
+    // create a new handle in the inner scope and update the workspace resource for it.
+    raft::resources tmp_res(res);
+    resource::set_workspace_resource(
+      tmp_res,
+      std::shared_ptr<rmm::mr::device_memory_resource>{
+        new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())},
+      orig_size * 2);
+
+    ASSERT_EQ(orig_mr, resource::get_workspace_resource(res));
+    ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res));
+
+    ASSERT_NE(orig_mr, resource::get_workspace_resource(tmp_res));
+    ASSERT_NE(orig_size, resource::get_workspace_total_bytes(tmp_res));
+  }
+  ASSERT_EQ(orig_mr, resource::get_workspace_resource(res));
+  ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res));
 }
 
 TEST(Raft, HandleCopy)

From 7736d768158636345bde43fb26ff0ed5d79d9806 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:03:30 +0200
Subject: [PATCH 14/25] Fix a typo

---
 cpp/test/core/handle.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp
index c0efe9236f..a1ad4385a7 100644
--- a/cpp/test/core/handle.cpp
+++ b/cpp/test/core/handle.cpp
@@ -298,7 +298,7 @@ TEST(Raft, WorkspaceResource)
   auto stream = resource::get_cuda_stream(handle);
   rmm::device_buffer buf(max_size / 2, stream, new_mr);
 
-  // Note, the underlying pool allocator likey uses more space than reported here
+  // Note, the underlying pool allocator likely uses more space than reported here
   ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle));
   ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle));
   ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle));

From be63f73ead447d7fef65aedd8c234949d20bb9d8 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 29 Jun 2023 08:33:54 +0200
Subject: [PATCH 15/25] Rename limited->limiting resource for consistency

---
 .../raft/core/resource/device_memory_resource.hpp    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 33e8944dcc..af2761882a 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -29,18 +29,18 @@
 #include <optional>
 
 namespace raft::resource {
-class limited_memory_resource : public resource {
+class limiting_memory_resource : public resource {
  public:
-  limited_memory_resource(std::shared_ptr<rmm::mr::device_memory_resource> mr,
-                          std::size_t allocation_limit,
-                          std::optional<std::size_t> alignment)
+  limiting_memory_resource(std::shared_ptr<rmm::mr::device_memory_resource> mr,
+                           std::size_t allocation_limit,
+                           std::optional<std::size_t> alignment)
     : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment))
   {
   }
 
   auto get_resource() -> void* override { return &mr_; }
 
-  ~limited_memory_resource() override = default;
+  ~limiting_memory_resource() override = default;
 
  private:
   std::shared_ptr<rmm::mr::device_memory_resource> upstream_;
@@ -79,7 +79,7 @@ class workspace_resource_factory : public resource_factory {
   auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; }
   auto make_resource() -> resource* override
   {
-    return new limited_memory_resource(mr_, allocation_limit_, alignment_);
+    return new limiting_memory_resource(mr_, allocation_limit_, alignment_);
   }
 
  private:

From c70728a04f459e528333ee6cd237fd30844e82e5 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 29 Jun 2023 12:49:43 +0200
Subject: [PATCH 16/25] Add comments

---
 .../core/resource/device_memory_resource.hpp  | 98 +++++++++++++++++--
 1 file changed, 89 insertions(+), 9 deletions(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index af2761882a..ec097c0333 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -82,30 +82,72 @@ class workspace_resource_factory : public resource_factory {
     return new limiting_memory_resource(mr_, allocation_limit_, alignment_);
   }
 
+  /** Construct a sensible default pool memory resource. */
+  static inline auto default_pool_resource(std::size_t limit)
+    -> std::shared_ptr<rmm::mr::device_memory_resource>
+  {
+    // Set the default granularity to 1 GiB
+    constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu;
+    // The initial size of the pool. The choice of this value only affects the performance a little
+    // bit. Heuristics:
+    //   1) the pool shouldn't be too big from the beginning independently of the limit;
+    //   2) otherwise, set it to half the max size to avoid too many resize calls.
+    auto min_size = std::min<std::size_t>(kOneGb, limit / 2lu);
+    // The pool is going to be place behind the limiting resource adaptor. This means the user won't
+    // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool
+    // itself may consume a little bit more memory than the 'limit' due to memory fragmentation.
+    // Therefore, we look for a compromise, such that:
+    //   1) 'limit' is accurate - the user should be more likely to run into the limiting resource
+    //      resource adaptor bad_alloc error than into the pool bad_alloc error.
+    //   2) The pool doesn't grab too much memory on top of the 'limit'.
+    auto max_size = std::min<std::size_t>(limit + kOneGb / 2lu, limit * 3lu / 2lu);
+    auto upstream = rmm::mr::get_current_device_resource();
+    return std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(
+      upstream, min_size, max_size);
+  }
+
+  /** Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). */
+  static inline auto default_plain_resource() -> std::shared_ptr<rmm::mr::device_memory_resource>
+  {
+    return std::shared_ptr<rmm::mr::device_memory_resource>{rmm::mr::get_current_device_resource(),
+                                                            void_op{}};
+  }
+
  private:
   std::size_t allocation_limit_;
   std::optional<std::size_t> alignment_;
   std::shared_ptr<rmm::mr::device_memory_resource> mr_;
 
-  // Create a pool memory resource by default
   static inline auto default_memory_resource(std::size_t limit)
     -> std::shared_ptr<rmm::mr::device_memory_resource>
   {
-    constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu;
-    auto min_size                = std::min<std::size_t>(kOneGb, limit / 2);
-    auto max_size                = limit * 3lu / 2lu;
-    auto upstream                = rmm::mr::get_current_device_resource();
-    return std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(
-      upstream, min_size, max_size);
+    if (rmm::mr::cuda_memory_resource{}.is_equal(*rmm::mr::get_current_device_resource())) {
+      // Use the memory pool if only we're sure the global memory resource is set to its default,
+      // which is the cuda_memory_resource.
+      // The reason for this is that some raft algorithms rely on the workspace allocator to be
+      // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths
+      // (batching), such as ANN-search routines. We don't want many allocations to happen there
+      // unless the user insists on it.
+      RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit);
+      return default_pool_resource(limit);
+    } else {
+      // If the user sets the global (rmm) memory resource to anything but the trivial
+      // cuda_memory_resource, we don't interfere that - they know better. In this case, the
+      // limiting resource adaptor is set on top the global (per-device) resource.
+      RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit);
+      return default_plain_resource();
+    }
   }
 
-  // Allow a fraction of available memory by default.
   static inline auto default_allocation_limit() -> std::size_t
   {
     std::size_t free_size{};
     std::size_t total_size{};
     RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
-    return free_size / 2;
+    // Note, the workspace does not claim all this memory from the start, so it's still usable by
+    // the main resource as well.
+    // This limit is merely an order for algorithm internals to plan the batching accordingly.
+    return total_size / 2;
   }
 };
 
@@ -164,4 +206,42 @@ inline void set_workspace_resource(resources const& res,
     std::make_shared<workspace_resource_factory>(mr, allocation_limit, alignment));
 };
 
+/**
+ * Set the temporary workspace resource to a pool on top of the global memory resource
+ * (`rmm::mr::get_current_device_resource()`.
+ *
+ * @param res raft resources object for managing resources
+ * @param allocation_limit
+ *   the total amount of memory in bytes available to the temporary workspace resources;
+ *   if not provided, a last used or default limit is used.
+ *
+ */
+inline void set_workspace_to_pool_resource(
+  resources const& res, std::optional<std::size_t> allocation_limit = std::nullopt)
+{
+  if (!allocation_limit.has_value()) { allocation_limit = get_workspace_total_bytes(res); }
+  res.add_resource_factory(std::make_shared<workspace_resource_factory>(
+    workspace_resource_factory::default_pool_resource(*allocation_limit),
+    allocation_limit,
+    std::nullopt));
+};
+
+/**
+ * Set the temporary workspace resource the same as the global memory resource
+ * (`rmm::mr::get_current_device_resource()`.
+ *
+ * Note, the workspace resource is always limited; the limit here defines how much of the global
+ * memory resource can be consumed by the workspace allocations.
+ *
+ * @param res raft resources object for managing resources
+ * @param allocation_limit
+ *   the total amount of memory in bytes available to the temporary workspace resources.
+ */
+inline void set_workspace_to_global_resource(
+  resources const& res, std::optional<std::size_t> allocation_limit = std::nullopt)
+{
+  res.add_resource_factory(std::make_shared<workspace_resource_factory>(
+    workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt));
+};
+
 }  // namespace raft::resource

From be047d4eb03895005edbf44abb33e0a1c1295b33 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 29 Jun 2023 13:11:33 +0200
Subject: [PATCH 17/25] Remove repeated word in the comment

---
 cpp/include/raft/core/resource/device_memory_resource.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index ec097c0333..701cf43f5b 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -97,7 +97,7 @@ class workspace_resource_factory : public resource_factory {
     // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool
     // itself may consume a little bit more memory than the 'limit' due to memory fragmentation.
     // Therefore, we look for a compromise, such that:
-    //   1) 'limit' is accurate - the user should be more likely to run into the limiting resource
+    //   1) 'limit' is accurate - the user should be more likely to run into the limiting
     //      resource adaptor bad_alloc error than into the pool bad_alloc error.
     //   2) The pool doesn't grab too much memory on top of the 'limit'.
     auto max_size = std::min<std::size_t>(limit + kOneGb / 2lu, limit * 3lu / 2lu);

From 3e151d4c0735b2173d55d931e24fd9e5158c1474 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 29 Jun 2023 13:13:37 +0200
Subject: [PATCH 18/25] Fix a missing word in the comment

---
 cpp/include/raft/core/resource/device_memory_resource.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 701cf43f5b..10e399439a 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -133,7 +133,7 @@ class workspace_resource_factory : public resource_factory {
     } else {
       // If the user sets the global (rmm) memory resource to anything but the trivial
       // cuda_memory_resource, we don't interfere that - they know better. In this case, the
-      // limiting resource adaptor is set on top the global (per-device) resource.
+      // limiting resource adaptor is set on top of the global (per-device) resource.
       RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit);
       return default_plain_resource();
     }

From 4cf64557d09e05ec287d2957025e5a1dee15a94a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 7 Jul 2023 10:12:30 +0200
Subject: [PATCH 19/25] Add a deprecation comment to the mr argument

---
 cpp/include/raft/neighbors/ivf_pq-inl.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index 8be13f675a..b875d81f15 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -444,8 +444,8 @@ void search_with_filtering(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr an optional memory resource to use across the searches (you can provide a large
- * enough memory pool here to avoid memory allocations within search).
+ * @param[in] mr (deprecated) an optional memory resource to use across the searches (you can
+ * provide a large enough memory pool here to avoid memory allocations within search).
  */
 template <typename T, typename IdxT>
 void search(raft::resources const& handle,

From 127907ca32f6eedc645167b6e712415262808dd2 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 7 Jul 2023 10:42:40 +0200
Subject: [PATCH 20/25] Add function deprecations

---
 cpp/include/raft/neighbors/ivf_pq-ext.cuh |  45 ++++++++-
 cpp/include/raft/neighbors/ivf_pq-inl.cuh | 106 ++++++++++++++++------
 2 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 1595f55d8c..fcfe837e2d 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -92,8 +92,7 @@ void search_with_filtering(raft::resources const& handle,
                            uint32_t k,
                            IdxT* neighbors,
                            float* distances,
-                           rmm::mr::device_memory_resource* mr = nullptr,
-                           IvfSampleFilterT sample_filter      = IvfSampleFilterT()) RAFT_EXPLICIT;
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT;
 
 template <typename T, typename IdxT>
 void search(raft::resources const& handle,
@@ -103,8 +102,34 @@ void search(raft::resources const& handle,
             uint32_t n_queries,
             uint32_t k,
             IdxT* neighbors,
-            float* distances,
-            rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+            float* distances) RAFT_EXPLICIT;
+
+template <typename T, typename IdxT, typename IvfSampleFilterT>
+[[deprecated(
+  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
+search_with_filtering(raft::resources const& handle,
+                      const raft::neighbors::ivf_pq::search_params& params,
+                      const index<IdxT>& idx,
+                      const T* queries,
+                      uint32_t n_queries,
+                      uint32_t k,
+                      IdxT* neighbors,
+                      float* distances,
+                      rmm::mr::device_memory_resource* mr,
+                      IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT;
+
+template <typename T, typename IdxT>
+[[deprecated(
+  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
+search(raft::resources const& handle,
+       const raft::neighbors::ivf_pq::search_params& params,
+       const index<IdxT>& idx,
+       const T* queries,
+       uint32_t n_queries,
+       uint32_t k,
+       IdxT* neighbors,
+       float* distances,
+       rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT;
 
 }  // namespace raft::neighbors::ivf_pq
 
@@ -182,7 +207,17 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t);
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
     float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    rmm::mr::device_memory_resource* mr);                            \
+                                                                     \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    const T* queries,                                                \
+    uint32_t n_queries,                                              \
+    uint32_t k,                                                      \
+    IdxT* neighbors,                                                 \
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(float, int64_t);
 instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t);
diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index b875d81f15..ccf8717486 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -166,7 +166,7 @@ void search_with_filtering(raft::resources const& handle,
                            raft::device_matrix_view<const T, uint32_t, row_major> queries,
                            raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,
                            raft::device_matrix_view<float, uint32_t, row_major> distances,
-                           IvfSampleFilterT sample_filter = IvfSampleFilterT())
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT{})
 {
   RAFT_EXPECTS(
     queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
@@ -229,7 +229,7 @@ void search(raft::resources const& handle,
                         queries,
                         neighbors,
                         distances,
-                        raft::neighbors::filtering::none_ivf_sample_filter());
+                        raft::neighbors::filtering::none_ivf_sample_filter{});
 }
 
 /** @} */  // end group ivf_pq
@@ -353,20 +353,17 @@ void extend(raft::resources const& handle,
  * eliminate entirely allocations happening within `search`:
  * @code{.cpp}
  *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
  *   // use default search parameters
  *   ivf_pq::search_params search_params;
  *   filtering::none_ivf_sample_filter filter;
  *   // Use the same allocator across multiple searches to reduce the number of
  *   // cuda memory allocations
  *   ivf_pq::search_with_filtering(
- *     handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr, filter);
+ *     handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, filter);
  *   ivf_pq::search_with_filtering(
- *     handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr, filter);
+ *     handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, filter);
  *   ivf_pq::search_with_filtering(
- *     handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr, filter);
+ *     handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, nfilter);
  *   ...
  * @endcode
  * The exact size of the temporary buffer depends on multiple factors and is an implementation
@@ -385,8 +382,6 @@ void extend(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr an optional memory resource to use across the searches (you can provide a large
- * enough memory pool here to avoid memory allocations within search).
  * @param[in] sample_filter a filter the greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
@@ -398,11 +393,41 @@ void search_with_filtering(raft::resources const& handle,
                            uint32_t k,
                            IdxT* neighbors,
                            float* distances,
-                           rmm::mr::device_memory_resource* mr = nullptr,
-                           IvfSampleFilterT sample_filter      = IvfSampleFilterT())
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT{})
 {
-  detail::search(
-    handle, params, idx, queries, n_queries, k, neighbors, distances, mr, sample_filter);
+  detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
+}
+
+/**
+ * This function is deprecated and will be removed in a future.
+ * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
+ */
+template <typename T, typename IdxT, typename IvfSampleFilterT>
+[[deprecated(
+  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
+search_with_filtering(raft::resources const& handle,
+                      const search_params& params,
+                      const index<IdxT>& idx,
+                      const T* queries,
+                      uint32_t n_queries,
+                      uint32_t k,
+                      IdxT* neighbors,
+                      float* distances,
+                      rmm::mr::device_memory_resource* mr,
+                      IvfSampleFilterT sample_filter = IvfSampleFilterT{})
+{
+  if (mr != nullptr) {
+    // Shallow copy of the resource with the automatic lifespan:
+    //                               change the workspace resource temporarily
+    raft::resources res_local(handle);
+    resource::set_workspace_resource(
+      res_local, std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}});
+    return search_with_filtering(
+      res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
+  } else {
+    return search_with_filtering(
+      handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
+  }
 }
 
 /**
@@ -444,8 +469,6 @@ void search_with_filtering(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr (deprecated) an optional memory resource to use across the searches (you can
- * provide a large enough memory pool here to avoid memory allocations within search).
  */
 template <typename T, typename IdxT>
 void search(raft::resources const& handle,
@@ -455,19 +478,46 @@ void search(raft::resources const& handle,
             uint32_t n_queries,
             uint32_t k,
             IdxT* neighbors,
-            float* distances,
-            rmm::mr::device_memory_resource* mr = nullptr)
+            float* distances)
 {
-  if (mr != nullptr) {
-    // Shallow copy of the resource with the automatic lifespan:
-    //                               change the workspace resource temporarily
-    raft::resources res_local(handle);
-    resource::set_workspace_resource(
-      res_local, std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}});
-    return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances);
-  } else {
-    return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances);
-  }
+  return search_with_filtering(handle,
+                               params,
+                               idx,
+                               queries,
+                               n_queries,
+                               k,
+                               neighbors,
+                               distances,
+                               raft::neighbors::filtering::none_ivf_sample_filter{});
+}
+
+/**
+ * This function is deprecated and will be removed in a future.
+ * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
+ */
+template <typename T, typename IdxT>
+[[deprecated(
+  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
+search(raft::resources const& handle,
+       const search_params& params,
+       const index<IdxT>& idx,
+       const T* queries,
+       uint32_t n_queries,
+       uint32_t k,
+       IdxT* neighbors,
+       float* distances,
+       rmm::mr::device_memory_resource* mr)
+{
+  return search_with_filtering(handle,
+                               params,
+                               idx,
+                               queries,
+                               n_queries,
+                               k,
+                               neighbors,
+                               distances,
+                               mr,
+                               raft::neighbors::filtering::none_ivf_sample_filter{});
 }
 
 }  // namespace raft::neighbors::ivf_pq

From d6a27c532168fd9ead7c36a0957105d102abe6c6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 7 Jul 2023 10:53:42 +0200
Subject: [PATCH 21/25] Remove ANN reference

---
 cpp/include/raft/core/resource/device_memory_resource.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 10e399439a..70b098db45 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -126,8 +126,8 @@ class workspace_resource_factory : public resource_factory {
       // which is the cuda_memory_resource.
       // The reason for this is that some raft algorithms rely on the workspace allocator to be
       // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths
-      // (batching), such as ANN-search routines. We don't want many allocations to happen there
-      // unless the user insists on it.
+      // (e.g. in batching). We don't want many allocations to happen there unless the user insists
+      // on it.
       RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit);
       return default_pool_resource(limit);
     } else {

From 453042397d5b3e57b4941ae5ebe6e7ba3c2a2c42 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 7 Jul 2023 11:41:51 +0200
Subject: [PATCH 22/25] Use the plain workspace resource by default and print a
 warning if necessary

---
 .../detail/device_memory_resource.hpp         | 58 +++++++++++++++++++
 .../core/resource/device_memory_resource.hpp  | 23 +-------
 .../raft/neighbors/detail/ivf_pq_build.cuh    |  4 ++
 .../raft/neighbors/detail/ivf_pq_search.cuh   |  2 +
 4 files changed, 65 insertions(+), 22 deletions(-)
 create mode 100644 cpp/include/raft/core/resource/detail/device_memory_resource.hpp

diff --git a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp
new file mode 100644
index 0000000000..9d3f13689d
--- /dev/null
+++ b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <mutex>
+#include <set>
+#include <string>
+
+namespace raft::resource::detail {
+
+/**
+ * Warn a user of the calling algorithm if they use the default non-pooled memory allocator,
+ * as it may hurt the performance.
+ *
+ * This helper function is designed to produce the warning once for a given `user_name`.
+ *
+ * @param[in] res
+ * @param[in] user_name the name of the algorithm or any other identification.
+ *
+ */
+inline void warn_non_pool_workspace(resources const& res, std::string user_name)
+{
+  // Detect if the plain cuda memory resource is used for the workspace
+  if (rmm::mr::cuda_memory_resource{}.is_equal(*get_workspace_resource(res)->get_upstream())) {
+    static std::set<std::string> notified_names{};
+    static std::mutex mutex{};
+    std::lock_guard<std::mutex> guard(mutex);
+    auto [it, inserted] = notified_names.insert(std::move(user_name));
+    if (inserted) {
+      RAFT_LOG_WARN(
+        "[%s] the default cuda resource is used for the raft workspace allocations. This may lead "
+        "to a significant slowdown for this algorithm. Consider using the default pool resource "
+        "(`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly "
+        "(`raft::resource::set_workspace_resource`).",
+        it->c_str());
+    }
+  }
+}
+
+}  // namespace raft::resource::detail
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 70b098db45..038e81e425 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -72,7 +72,7 @@ class workspace_resource_factory : public resource_factory {
     std::optional<std::size_t> alignment                = std::nullopt)
     : allocation_limit_(allocation_limit.value_or(default_allocation_limit())),
       alignment_(alignment),
-      mr_(mr ? mr : default_memory_resource(allocation_limit_))
+      mr_(mr ? mr : default_plain_resource())
   {
   }
 
@@ -118,27 +118,6 @@ class workspace_resource_factory : public resource_factory {
   std::optional<std::size_t> alignment_;
   std::shared_ptr<rmm::mr::device_memory_resource> mr_;
 
-  static inline auto default_memory_resource(std::size_t limit)
-    -> std::shared_ptr<rmm::mr::device_memory_resource>
-  {
-    if (rmm::mr::cuda_memory_resource{}.is_equal(*rmm::mr::get_current_device_resource())) {
-      // Use the memory pool if only we're sure the global memory resource is set to its default,
-      // which is the cuda_memory_resource.
-      // The reason for this is that some raft algorithms rely on the workspace allocator to be
-      // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths
-      // (e.g. in batching). We don't want many allocations to happen there unless the user insists
-      // on it.
-      RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit);
-      return default_pool_resource(limit);
-    } else {
-      // If the user sets the global (rmm) memory resource to anything but the trivial
-      // cuda_memory_resource, we don't interfere that - they know better. In this case, the
-      // limiting resource adaptor is set on top of the global (per-device) resource.
-      RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit);
-      return default_plain_resource();
-    }
-  }
-
   static inline auto default_allocation_limit() -> std::size_t
   {
     std::size_t free_size{};
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index a006e2f1d2..199cb74fbe 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -29,6 +29,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/add.cuh>
@@ -1317,6 +1318,8 @@ void extend(raft::resources const& handle,
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim());
+
+  resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend");
   auto stream           = resource::get_cuda_stream(handle);
   const auto n_clusters = index->n_lists();
 
@@ -1515,6 +1518,7 @@ auto build(raft::resources const& handle,
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_pq::build(%zu, %u)", size_t(n_rows), dim);
+  resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build");
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "Unsupported data type");
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index c0020ee490..ab690240e0 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -31,6 +31,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/gemm.cuh>
@@ -738,6 +739,7 @@ inline void search(raft::resources const& handle,
     params.n_probes,
     k,
     index.dim());
+  resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::search");
 
   RAFT_EXPECTS(
     params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F,

From 775f71800a2b43dff8132f2b7624539f67cab571 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 18 Jul 2023 14:41:04 +0200
Subject: [PATCH 23/25] Add a note about no deleter

---
 cpp/include/raft/core/resource/device_memory_resource.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 038e81e425..6cb317f9b7 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -106,7 +106,13 @@ class workspace_resource_factory : public resource_factory {
       upstream, min_size, max_size);
   }
 
-  /** Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). */
+  /**
+   * Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter).
+   *
+   * Note: the lifetime of the underlying `rmm::mr::get_current_device_resource()` is managed
+   * somewhere else, since it's passed by a raw pointer. Hence, this shared_ptr wrapper is not
+   * allowed to delete the pointer on destruction.
+   */
   static inline auto default_plain_resource() -> std::shared_ptr<rmm::mr::device_memory_resource>
   {
     return std::shared_ptr<rmm::mr::device_memory_resource>{rmm::mr::get_current_device_resource(),

From d7fcde94a1ffcbdd721081b42a10662446bda4c9 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 18 Jul 2023 14:41:48 +0200
Subject: [PATCH 24/25] Use the workspace resource size to determine the batch
 sizes for ivf-pq

---
 cpp/include/raft/neighbors/detail/ivf_pq_search.cuh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index ab690240e0..5d6186edac 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -32,6 +32,7 @@
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/detail/device_memory_resource.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/gemm.cuh>
@@ -678,6 +679,7 @@ struct ivfpq_search {
  * A heuristic for bounding the number of queries per batch, to improve GPU utilization.
  * (based on the number of SMs and the work size).
  *
+ * @param res is used to query the workspace size
  * @param k top-k
  * @param n_probes number of selected clusters per query
  * @param n_queries number of queries hoped to be processed at once.
@@ -686,7 +688,8 @@ struct ivfpq_search {
  *
  * @return maximum recommended batch size.
  */
-inline auto get_max_batch_size(uint32_t k,
+inline auto get_max_batch_size(raft::resources const& res,
+                               uint32_t k,
                                uint32_t n_probes,
                                uint32_t n_queries,
                                uint32_t max_samples) -> uint32_t
@@ -705,11 +708,11 @@ inline auto get_max_batch_size(uint32_t k,
   auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t {
     return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs;
   };
-  constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024;
-  if (ws_size(max_batch_size) > kMaxWsSize) {
+  auto max_ws_size = resource::get_workspace_total_bytes(res);
+  if (ws_size(max_batch_size) > max_ws_size) {
     uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size);
     // gradually reduce the batch size until we fit into the max size limit.
-    while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) {
+    while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > max_ws_size) {
       smaller_batch_size >>= 1;
     }
     return smaller_batch_size;
@@ -780,7 +783,7 @@ inline void search(raft::resources const& handle,
 
   // Maximum number of query vectors to search at the same time.
   const auto max_queries = std::min<uint32_t>(std::max<uint32_t>(n_queries, 1), 4096);
-  auto max_batch_size    = get_max_batch_size(k, n_probes, max_queries, max_samples);
+  auto max_batch_size    = get_max_batch_size(handle, k, n_probes, max_queries, max_samples);
 
   rmm::device_uvector<float> float_queries(max_queries * dim_ext, stream, mr);
   rmm::device_uvector<float> rot_queries(max_queries * index.rot_dim(), stream, mr);

From ae0f4693f3ea243adaf3f976e6123f074d82521f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:41:38 +0200
Subject: [PATCH 25/25] Use get_workspace_free_bytes and debug-log the usage of
 the default pool resource

---
 cpp/include/raft/core/resource/device_memory_resource.hpp | 6 ++++++
 cpp/include/raft/neighbors/detail/ivf_pq_search.cuh       | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 6cb317f9b7..af684df747 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -102,6 +102,12 @@ class workspace_resource_factory : public resource_factory {
     //   2) The pool doesn't grab too much memory on top of the 'limit'.
     auto max_size = std::min<std::size_t>(limit + kOneGb / 2lu, limit * 3lu / 2lu);
     auto upstream = rmm::mr::get_current_device_resource();
+    RAFT_LOG_DEBUG(
+      "Setting the workspace pool resource; memory limit = %zu, initial pool size = %zu, max pool "
+      "size = %zu.",
+      limit,
+      min_size,
+      max_size);
     return std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(
       upstream, min_size, max_size);
   }
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 5d6186edac..298083d1e5 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -708,7 +708,7 @@ inline auto get_max_batch_size(raft::resources const& res,
   auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t {
     return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs;
   };
-  auto max_ws_size = resource::get_workspace_total_bytes(res);
+  auto max_ws_size = resource::get_workspace_free_bytes(res);
   if (ws_size(max_batch_size) > max_ws_size) {
     uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size);
     // gradually reduce the batch size until we fit into the max size limit.