From a9397843081cc011b1f408ab978fcebac90431d9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 20 Mar 2023 11:23:04 +0100 Subject: [PATCH 01/25] Wrap the workspace resource into a limiting_resource_adaptor --- cpp/include/raft/core/device_resources.hpp | 17 ++-- .../core/resource/device_memory_resource.hpp | 80 +++++++++++++++---- cpp/include/raft/core/resources.hpp | 7 +- 3 files changed, 80 insertions(+), 24 deletions(-) diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index df6b39a368..0275f08152 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -60,12 +61,13 @@ namespace raft { class device_resources : public resources { public: device_resources(const device_resources& handle, - rmm::mr::device_memory_resource* workspace_resource) + rmm::mr::device_memory_resource* workspace_resource, + std::optional allocation_limit = std::nullopt) : resources{handle} { // replace the resource factory for the workspace_resources - resources::add_resource_factory( - std::make_shared(workspace_resource)); + resources::add_resource_factory(std::make_shared( + workspace_resource, allocation_limit, std::nullopt)); } device_resources(const device_resources& handle) : resources{handle} {} @@ -83,7 +85,8 @@ class device_resources : public resources { */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + rmm::mr::device_memory_resource* workspace_resource = nullptr, + std::optional allocation_limit = std::nullopt) : resources{} { resources::add_resource_factory(std::make_shared()); @@ -91,8 +94,8 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); - resources::add_resource_factory( - std::make_shared(workspace_resource)); + resources::add_resource_factory(std::make_shared( + workspace_resource, allocation_limit, std::nullopt)); } /** Destroys all held-up resources */ @@ -255,4 +258,4 @@ class stream_syncer { } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 35ae3d715f..061ba09235 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -17,21 +17,52 @@ #include #include +#include + #include +#include + +#include +#include namespace raft::resource { -class device_memory_resource : public resource { +class limited_memory_resource : public resource { public: - device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) + limited_memory_resource(rmm::mr::device_memory_resource* mr, + std::optional allocation_limit, + std::optional alignment) + : mr_(make_adaptor(mr, allocation_limit, alignment)) { - if (mr_ == nullptr) { mr = rmm::mr::get_current_device_resource(); } } - void* get_resource() override { return mr; } - ~device_memory_resource() override {} + auto get_resource() -> void* override { return &mr_; } + + ~limited_memory_resource() override = default; private: - rmm::mr::device_memory_resource* mr; + rmm::mr::limiting_resource_adaptor mr_; + + static inline auto get_device_mem() + { + std::size_t free_size{}; + std::size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return total_size; + } + + static inline auto make_adaptor(rmm::mr::device_memory_resource* mr, + std::optional allocation_limit, + std::optional alignment) + -> rmm::mr::limiting_resource_adaptor + { + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + auto limit = allocation_limit.value_or(get_device_mem()); + if (alignment.has_value()) { + return rmm::mr::limiting_resource_adaptor(mr, limit, alignment.value()); + } else { + return rmm::mr::limiting_resource_adaptor(mr, limit); + } + } }; /** @@ -40,12 +71,22 @@ class device_memory_resource : public resource { */ class workspace_resource_factory : public resource_factory { public: - workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {} - resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; } - resource* make_resource() override { return new device_memory_resource(mr); } + workspace_resource_factory(rmm::mr::device_memory_resource* mr, + std::optional allocation_limit, + std::optional alignment) + : mr_(mr), allocation_limit_(allocation_limit), alignment_(alignment) + { + } + auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; } + auto make_resource() -> resource* override + { + return new limited_memory_resource(mr_, allocation_limit_, alignment_); + } private: - rmm::mr::device_memory_resource* mr; + rmm::mr::device_memory_resource* mr_; + std::optional allocation_limit_; + std::optional alignment_; }; /** @@ -54,12 +95,15 @@ class workspace_resource_factory : public resource_factory { * @param res raft resources object for managing resources * @return device memory resource object */ -inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res) +inline auto get_workspace_resource(resources const& res) + -> rmm::mr::limiting_resource_adaptor* { if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { - res.add_resource_factory(std::make_shared()); + res.add_resource_factory( + std::make_shared(nullptr, std::nullopt, std::nullopt)); } - return res.get_resource(resource_type::WORKSPACE_RESOURCE); + return res.get_resource>( + resource_type::WORKSPACE_RESOURCE); }; /** @@ -68,8 +112,12 @@ inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& * @param res raft resources object for managing resources * @param mr a valid rmm device_memory_resource */ -inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr) +inline void set_workspace_resource(resources const& res, + rmm::mr::device_memory_resource* mr = nullptr, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) { - res.add_resource_factory(std::make_shared(mr)); + res.add_resource_factory( + std::make_shared(mr, allocation_limit, alignment)); }; -} // namespace raft::resource \ No newline at end of file +} // namespace raft::resource diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp index 64e281e934..e31918dc6d 100644 --- a/cpp/include/raft/core/resources.hpp +++ b/cpp/include/raft/core/resources.hpp @@ -94,6 +94,11 @@ class resources { RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY, "LAST_KEY is a placeholder and not a valid resource factory type."); factories_.at(rtype) = std::make_pair(rtype, factory); + // Clear the corresponding resource, so that on next `get_resource` the new factory is used + if (resources_.at(rtype).first != resource::resource_type::LAST_KEY) { + resources_.at(rtype) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared()); + } } /** @@ -128,4 +133,4 @@ class resources { mutable std::vector factories_; mutable std::vector resources_; }; -} // namespace raft \ No newline at end of file +} // namespace raft From ac5762bad46d30dbab04f0bbcefc89fe02e90af6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 20 Mar 2023 19:56:54 +0100 Subject: [PATCH 02/25] Set the pool memory resource by default and start the ivf-pq use case --- .../core/resource/device_memory_resource.hpp | 60 ++++++++++++++++--- .../raft/neighbors/detail/ivf_pq_search.cuh | 16 ++--- cpp/include/raft/neighbors/ivf_pq.cuh | 6 +- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 061ba09235..ce6dc0623b 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -31,7 +32,16 @@ class limited_memory_resource : public resource { limited_memory_resource(rmm::mr::device_memory_resource* mr, std::optional allocation_limit, std::optional alignment) - : mr_(make_adaptor(mr, allocation_limit, alignment)) + : limited_memory_resource(mr, get_alloc_limit(allocation_limit), alignment) + { + } + + template + limited_memory_resource(rmm::mr::device_memory_resource* mr, + Deleter d, + std::optional allocation_limit, + std::optional alignment) + : limited_memory_resource(mr, d, get_alloc_limit(allocation_limit), alignment) { } @@ -40,27 +50,59 @@ class limited_memory_resource : public resource { ~limited_memory_resource() override = default; private: + std::shared_ptr upstream_; rmm::mr::limiting_resource_adaptor mr_; - static inline auto get_device_mem() + limited_memory_resource(rmm::mr::device_memory_resource* mr, + std::size_t allocation_limit, + std::optional alignment) + : upstream_{get_upstream(mr, allocation_limit)}, + mr_(make_adaptor(upstream_, allocation_limit, alignment)) + { + } + + template + limited_memory_resource(rmm::mr::device_memory_resource* mr, + Deleter d, + std::size_t allocation_limit, + std::optional alignment) + : upstream_{get_upstream(mr, allocation_limit), d}, + mr_{make_adaptor(upstream_, allocation_limit, alignment)} + { + } + + static inline auto get_upstream(rmm::mr::device_memory_resource* mr, std::size_t allocation_limit) + -> rmm::mr::device_memory_resource* + { + if (mr != nullptr) { return mr; } + // Create a pool memory resource by default + constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; + auto min_size = std::min(kOneGb, allocation_limit / 2); + auto max_size = allocation_limit * 3lu / 2lu; + return new rmm::mr::pool_memory_resource( + rmm::mr::get_current_device_resource(), min_size, max_size); + } + + static inline auto get_alloc_limit(std::optional limit) -> std::size_t { + if (limit.has_value()) { return limit.value(); } + // Allow a fraction of available memory by default. std::size_t free_size{}; std::size_t total_size{}; RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); - return total_size; + return free_size / 2; } - static inline auto make_adaptor(rmm::mr::device_memory_resource* mr, - std::optional allocation_limit, + static inline auto make_adaptor(std::shared_ptr upstream, + std::size_t limit, std::optional alignment) -> rmm::mr::limiting_resource_adaptor { - if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - auto limit = allocation_limit.value_or(get_device_mem()); + auto p = upstream.get(); if (alignment.has_value()) { - return rmm::mr::limiting_resource_adaptor(mr, limit, alignment.value()); + return rmm::mr::limiting_resource_adaptor(p, limit, alignment.value()); } else { - return rmm::mr::limiting_resource_adaptor(mr, limit); + return rmm::mr::limiting_resource_adaptor(p, limit); } } }; diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4b6e6f5e31..7323239e67 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -1262,10 +1262,10 @@ void ivfpq_search_worker(raft::device_resources const& handle, IdxT* neighbors, // [n_queries, topK] float* distances, // [n_queries, topK] float scaling_factor, - double preferred_shmem_carveout, - rmm::mr::device_memory_resource* mr) + double preferred_shmem_carveout) { auto stream = handle.get_stream(); + auto mr = handle.get_workspace_resource(); bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); auto topk_len = manage_local_topk ? n_probes * topK : max_samples; @@ -1554,8 +1554,7 @@ inline void search(raft::device_resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + float* distances) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); @@ -1601,11 +1600,7 @@ inline void search(raft::device_resources const& handle, max_samples = ms; } - auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16); - if (pool_guard) { - RAFT_LOG_DEBUG("ivf_pq::search: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); - } + auto mr = handle.get_workspace_resource(); // Maximum number of query vectors to search at the same time. const auto max_queries = std::min(std::max(n_queries, 1), 4096); @@ -1669,8 +1664,7 @@ inline void search(raft::device_resources const& handle, neighbors + uint64_t(k) * (offset_q + offset_b), distances + uint64_t(k) * (offset_q + offset_b), utils::config::kDivisor / utils::config::kDivisor, - params.preferred_shmem_carveout, - mr); + params.preferred_shmem_carveout); } } } diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 4a12ca72a4..2a5c322016 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -182,8 +182,7 @@ void search(raft::device_resources const& handle, static_cast(queries.extent(0)), k, neighbors.data_handle(), - distances.data_handle(), - handle.get_workspace_resource()); + distances.data_handle()); } /** @} */ // end group ivf_pq @@ -349,7 +348,8 @@ void search(raft::device_resources const& handle, float* distances, rmm::mr::device_memory_resource* mr = nullptr) { - return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, mr); + if (mr != nullptr) { raft::resource::set_workspace_resource(handle, mr); } + return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); } } // namespace raft::neighbors::ivf_pq From c33d519ad1271e32a55c599a3b19176159727f82 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 21 Mar 2023 08:16:47 +0100 Subject: [PATCH 03/25] Refactor the resource to more rely on shared_ptr to manage lifetime --- cpp/include/raft/core/device_resources.hpp | 8 +- .../core/resource/device_memory_resource.hpp | 118 +++++++++--------- 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index 0275f08152..6f82fc4f6c 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -66,8 +66,7 @@ class device_resources : public resources { : resources{handle} { // replace the resource factory for the workspace_resources - resources::add_resource_factory(std::make_shared( - workspace_resource, allocation_limit, std::nullopt)); + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); } device_resources(const device_resources& handle) : resources{handle} {} @@ -82,6 +81,8 @@ class device_resources : public resources { * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. + * NB: the function takes the ownership of the resource; the semantics is the same + * as if the pointer was passed to std::shared_ptr constructor. */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, @@ -94,8 +95,7 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); - resources::add_resource_factory(std::make_shared( - workspace_resource, allocation_limit, std::nullopt)); + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); } /** Destroys all held-up resources */ diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index ce6dc0623b..a0685ebc07 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -29,19 +29,10 @@ namespace raft::resource { class limited_memory_resource : public resource { public: - limited_memory_resource(rmm::mr::device_memory_resource* mr, - std::optional allocation_limit, - std::optional alignment) - : limited_memory_resource(mr, get_alloc_limit(allocation_limit), alignment) - { - } - - template - limited_memory_resource(rmm::mr::device_memory_resource* mr, - Deleter d, - std::optional allocation_limit, + limited_memory_resource(std::shared_ptr mr, + std::size_t allocation_limit, std::optional alignment) - : limited_memory_resource(mr, d, get_alloc_limit(allocation_limit), alignment) + : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment)) { } @@ -53,46 +44,6 @@ class limited_memory_resource : public resource { std::shared_ptr upstream_; rmm::mr::limiting_resource_adaptor mr_; - limited_memory_resource(rmm::mr::device_memory_resource* mr, - std::size_t allocation_limit, - std::optional alignment) - : upstream_{get_upstream(mr, allocation_limit)}, - mr_(make_adaptor(upstream_, allocation_limit, alignment)) - { - } - - template - limited_memory_resource(rmm::mr::device_memory_resource* mr, - Deleter d, - std::size_t allocation_limit, - std::optional alignment) - : upstream_{get_upstream(mr, allocation_limit), d}, - mr_{make_adaptor(upstream_, allocation_limit, alignment)} - { - } - - static inline auto get_upstream(rmm::mr::device_memory_resource* mr, std::size_t allocation_limit) - -> rmm::mr::device_memory_resource* - { - if (mr != nullptr) { return mr; } - // Create a pool memory resource by default - constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; - auto min_size = std::min(kOneGb, allocation_limit / 2); - auto max_size = allocation_limit * 3lu / 2lu; - return new rmm::mr::pool_memory_resource( - rmm::mr::get_current_device_resource(), min_size, max_size); - } - - static inline auto get_alloc_limit(std::optional limit) -> std::size_t - { - if (limit.has_value()) { return limit.value(); } - // Allow a fraction of available memory by default. - std::size_t free_size{}; - std::size_t total_size{}; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); - return free_size / 2; - } - static inline auto make_adaptor(std::shared_ptr upstream, std::size_t limit, std::optional alignment) @@ -113,12 +64,16 @@ class limited_memory_resource : public resource { */ class workspace_resource_factory : public resource_factory { public: - workspace_resource_factory(rmm::mr::device_memory_resource* mr, - std::optional allocation_limit, - std::optional alignment) - : mr_(mr), allocation_limit_(allocation_limit), alignment_(alignment) + explicit workspace_resource_factory( + std::shared_ptr mr = {nullptr}, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) + : allocation_limit_(allocation_limit.value_or(default_allocation_limit())), + alignment_(alignment), + mr_(mr ? mr : default_memory_resource(allocation_limit_)) { } + auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; } auto make_resource() -> resource* override { @@ -126,9 +81,30 @@ class workspace_resource_factory : public resource_factory { } private: - rmm::mr::device_memory_resource* mr_; - std::optional allocation_limit_; + std::size_t allocation_limit_; std::optional alignment_; + std::shared_ptr mr_; + + // Create a pool memory resource by default + static inline auto default_memory_resource(std::size_t limit) + -> std::shared_ptr + { + constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; + auto min_size = std::min(kOneGb, limit / 2); + auto max_size = limit * 3lu / 2lu; + auto upstream = rmm::mr::get_current_device_resource(); + return std::make_shared>( + upstream, min_size, max_size); + } + + // Allow a fraction of available memory by default. + static inline auto default_allocation_limit() -> std::size_t + { + std::size_t free_size{}; + std::size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return free_size / 2; + } }; /** @@ -141,8 +117,7 @@ inline auto get_workspace_resource(resources const& res) -> rmm::mr::limiting_resource_adaptor* { if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { - res.add_resource_factory( - std::make_shared(nullptr, std::nullopt, std::nullopt)); + res.add_resource_factory(std::make_shared()); } return res.get_resource>( resource_type::WORKSPACE_RESOURCE); @@ -155,11 +130,32 @@ inline auto get_workspace_resource(resources const& res) * @param mr a valid rmm device_memory_resource */ inline void set_workspace_resource(resources const& res, - rmm::mr::device_memory_resource* mr = nullptr, + std::shared_ptr mr = {nullptr}, std::optional allocation_limit = std::nullopt, std::optional alignment = std::nullopt) { res.add_resource_factory( std::make_shared(mr, allocation_limit, alignment)); }; + +inline void set_workspace_resource(resources const& res, + rmm::mr::device_memory_resource* mr, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) +{ + set_workspace_resource( + res, std::shared_ptr{mr}, allocation_limit, alignment); +}; + +template +inline void set_workspace_resource(resources const& res, + rmm::mr::device_memory_resource* mr, + Deleter d, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) +{ + set_workspace_resource( + res, std::shared_ptr{mr, d}, allocation_limit, alignment); +}; + } // namespace raft::resource From de5dd848ae6603f1c8f1e330f3b1dc178d4a4fa9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 21 Mar 2023 09:43:35 +0100 Subject: [PATCH 04/25] Preserve the semantics of not transfering the ownership of raw pointers to keep the change non-breaking --- cpp/include/raft/core/device_resources.hpp | 4 +- .../core/resource/device_memory_resource.hpp | 42 ++++++++++++------- .../raft/neighbors/detail/ivf_pq_build.cuh | 10 +---- cpp/include/raft/neighbors/ivf_pq.cuh | 5 ++- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index 6f82fc4f6c..47370f5371 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -81,8 +81,8 @@ class device_resources : public resources { * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. - * NB: the function takes the ownership of the resource; the semantics is the same - * as if the pointer was passed to std::shared_ptr constructor. + * @param[in] allocation_limit the total amount of memory in bytes available to the temporary + * workspace resources. */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index a0685ebc07..d1c5b1726a 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -110,6 +111,7 @@ class workspace_resource_factory : public resource_factory { /** * Load a temp workspace resource from a resources instance (and populate it on the res * if needed). + * * @param res raft resources object for managing resources * @return device memory resource object */ @@ -124,10 +126,14 @@ inline auto get_workspace_resource(resources const& res) }; /** - * Set a temp workspace resource on a resources instance. + * Set a temporary workspace resource on a resources instance. * * @param res raft resources object for managing resources - * @param mr a valid rmm device_memory_resource + * @param mr an optional RMM device_memory_resource + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + * @param alignment optional alignment requirements passed to RMM allocations + * */ inline void set_workspace_resource(resources const& res, std::shared_ptr mr = {nullptr}, @@ -138,24 +144,30 @@ inline void set_workspace_resource(resources const& res, std::make_shared(mr, allocation_limit, alignment)); }; +/** + * Set a temporary workspace resource on a resources instance. + * + * @param res raft resources object for managing resources + * @param mr an optional RMM device_memory_resource; + * note, the ownership of the object is not transferred with this raw pointer interface. + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + * @param alignment optional alignment requirements passed to RMM allocations + * + */ inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr, std::optional allocation_limit = std::nullopt, std::optional alignment = std::nullopt) { - set_workspace_resource( - res, std::shared_ptr{mr}, allocation_limit, alignment); -}; - -template -inline void set_workspace_resource(resources const& res, - rmm::mr::device_memory_resource* mr, - Deleter d, - std::optional allocation_limit = std::nullopt, - std::optional alignment = std::nullopt) -{ - set_workspace_resource( - res, std::shared_ptr{mr, d}, allocation_limit, alignment); + // NB: to preserve the semantics of passing memory resource without transferring the ownership, + // we create a shared pointer with a dummy deleter (void_op). + set_workspace_resource(res, + mr != nullptr + ? std::shared_ptr{mr, void_op{}} + : std::shared_ptr{nullptr}, + allocation_limit, + alignment); }; } // namespace raft::resource diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 1a563d213e..d424d59dd1 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -442,9 +442,6 @@ void train_per_subset(raft::device_resources const& handle, index.pq_len(), stream); - // clone the handle and attached the device memory resource to it - const device_resources new_handle(handle, device_memory); - // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); @@ -458,7 +455,7 @@ void train_per_subset(raft::device_resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, sub_trainset_view, centers_tmp_view, @@ -523,9 +520,6 @@ void train_per_cluster(raft::device_resources const& handle, indices + cluster_offsets[l], device_memory); - // clone the handle and attached the device memory resource to it - const device_resources new_handle(handle, device_memory); - // limit the cluster size to bound the training time. // [sic] we interpret the data as pq_len-dimensional size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); @@ -546,7 +540,7 @@ void train_per_cluster(raft::device_resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, rot_vectors_view, centers_tmp_view, diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 2a5c322016..6e51f65cf3 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -23,10 +23,13 @@ #include #include +#include #include #include +#include + namespace raft::neighbors::ivf_pq { /** @@ -348,7 +351,7 @@ void search(raft::device_resources const& handle, float* distances, rmm::mr::device_memory_resource* mr = nullptr) { - if (mr != nullptr) { raft::resource::set_workspace_resource(handle, mr); } + if (mr != nullptr) { resource::set_workspace_resource(handle, mr); } return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); } From 79c954e228030ff2aa58ce3b9c4be3c8778b931c Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 08:29:16 +0200 Subject: [PATCH 05/25] Fix a missing merge change --- cpp/include/raft/neighbors/ivf_pq-inl.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index aecce5e289..8e10e99088 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -182,8 +182,7 @@ void search(raft::device_resources const& handle, static_cast(queries.extent(0)), k, neighbors.data_handle(), - distances.data_handle(), - handle.get_workspace_resource()); + distances.data_handle()); } /** @} */ // end group ivf_pq From 6cf11037b4c4f64f9057cd7cc6357907b8631b33 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 09:02:35 +0200 Subject: [PATCH 06/25] Make the resource change not permanent --- cpp/include/raft/neighbors/ivf_pq-inl.cuh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index 8e10e99088..be8f7edaf1 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -348,8 +348,12 @@ void search(raft::device_resources const& handle, float* distances, rmm::mr::device_memory_resource* mr = nullptr) { - if (mr != nullptr) { resource::set_workspace_resource(handle, mr); } - return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); + if (mr != nullptr) { + const device_resources res_local(handle, mr); + return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances); + } else { + return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); + } } } // namespace raft::neighbors::ivf_pq From 370b9edd1036ebc95509d644ac335f1711100579 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 11:09:18 +0200 Subject: [PATCH 07/25] Don't force use the temp local workspace for all raft allocations --- .../raft/core/device_container_policy.hpp | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index eef981e56f..1fb2080fdb 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -164,7 +164,7 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res)); + return container_type(n, resource::get_cuda_stream(res)); } device_uvector_policy() = default; @@ -183,4 +183,43 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } }; +/** + * @brief A container policy for device mdarray allocated in the temporary workspace. + */ +template +class device_uvector_workspace_policy { + public: + using element_type = ElementType; + using container_type = device_uvector; + // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = device_reference; + using const_reference = device_reference; + + using accessor_policy = std::experimental::default_accessor; + using const_accessor_policy = std::experimental::default_accessor; + + public: + auto create(raft::resources const& res, size_t n) -> container_type + { + return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res)); + } + + device_uvector_workspace_policy() = default; + + [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + { + return c[n]; + } + [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept + -> const_reference + { + return c[n]; + } + + [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } + [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } +}; + } // namespace raft From 06cf4ff6aa655548f4f3b82fa99bbc7dd042c5c7 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 13:06:07 +0200 Subject: [PATCH 08/25] Don't use device_resources --- cpp/include/raft/neighbors/ivf_pq-inl.cuh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index 7fc0944e7b..8be13f675a 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -16,18 +16,18 @@ #pragma once -#include #include #include #include #include #include -#include // TODO: remove when possible +#include #include -#include -#include +#include + +#include // shared_ptr namespace raft::neighbors::ivf_pq { @@ -459,8 +459,11 @@ void search(raft::resources const& handle, rmm::mr::device_memory_resource* mr = nullptr) { if (mr != nullptr) { - // TODO: replace in with a method that would allow to clone raft::resources - const device_resources res_local(handle, mr); + // Shallow copy of the resource with the automatic lifespan: + // change the workspace resource temporarily + raft::resources res_local(handle); + resource::set_workspace_resource( + res_local, std::shared_ptr{mr, void_op{}}); return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances); } else { return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); From f27ba869fe581d9b0bb1a06007baf2bd6545c43d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 14:23:30 +0200 Subject: [PATCH 09/25] Using more of workspace memory resource --- .../raft/neighbors/detail/ivf_pq_build.cuh | 44 ++++++------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 9ada69e0f1..a006e2f1d2 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -346,10 +346,10 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); @@ -392,11 +392,6 @@ void train_per_subset(raft::resources const& handle, index.pq_len(), stream); - // clone the handle and attached the device memory resource to it - // TODO(achirkin): check if we should remove the device memory in the arguments - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); @@ -428,10 +423,10 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); @@ -475,11 +470,6 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // clone the handle and attached the device memory resource to it - // TODO(achirkin): check if we should remove the device memory in the arguments - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // limit the cluster size to bound the training time. // [sic] we interpret the data as pq_len-dimensional size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); @@ -1545,24 +1535,18 @@ auto build(raft::resources const& handle, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq::build: using pool memory resource"); } - + auto* device_memory = resource::get_workspace_resource(handle); rmm::mr::managed_memory_resource managed_memory_upstream; rmm::mr::pool_memory_resource managed_memory( &managed_memory_upstream, 1024 * 1024); // If the trainset is small enough to comfortably fit into device memory, put it there. // Otherwise, use the managed memory. + constexpr size_t kTolerableRatio = 4; rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; - { - size_t free_mem, total_mem; - constexpr size_t kTolerableRatio = 4; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); - if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) { - big_memory_resource = device_memory; - } + if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < + resource::get_workspace_free_bytes(handle)) { + big_memory_resource = device_memory; } // Besides just sampling, we transform the input dataset into floats to make it easier @@ -1711,8 +1695,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, @@ -1721,8 +1704,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; default: RAFT_FAIL("Unreachable code"); } From d435855dd76930db05d1270979ba8dd714d12d7a Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 14:24:35 +0200 Subject: [PATCH 10/25] Let device_uvector_policy keep the memory resource when needed --- .../raft/core/device_container_policy.hpp | 51 +++++-------------- cpp/include/raft/core/device_mdarray.hpp | 2 +- .../neighbors/detail/ivf_flat_serialize.cuh | 2 +- .../neighbors/detail/ivf_pq_serialize.cuh | 2 +- 4 files changed, 15 insertions(+), 42 deletions(-) diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index 1fb2080fdb..011de307db 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -164,49 +164,19 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - return container_type(n, resource::get_cuda_stream(res)); + if (mr_ == nullptr) { + // NB: not using the workspace resource by default! + // The workspace resource is for short-lived temporary allocations. + return container_type(n, resource::get_cuda_stream(res)); + } else { + return container_type(n, resource::get_cuda_stream(res), mr_); + } } - device_uvector_policy() = default; - - [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference + constexpr device_uvector_policy() = default; + constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr) { - return c[n]; } - [[nodiscard]] constexpr auto access(container_type const& c, size_t n) const noexcept - -> const_reference - { - return c[n]; - } - - [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } - [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } -}; - -/** - * @brief A container policy for device mdarray allocated in the temporary workspace. - */ -template -class device_uvector_workspace_policy { - public: - using element_type = ElementType; - using container_type = device_uvector; - // FIXME(jiamingy): allocator type is not supported by rmm::device_uvector - using pointer = typename container_type::pointer; - using const_pointer = typename container_type::const_pointer; - using reference = device_reference; - using const_reference = device_reference; - - using accessor_policy = std::experimental::default_accessor; - using const_accessor_policy = std::experimental::default_accessor; - - public: - auto create(raft::resources const& res, size_t n) -> container_type - { - return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res)); - } - - device_uvector_workspace_policy() = default; [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -220,6 +190,9 @@ class device_uvector_workspace_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + + private: + rmm::mr::device_memory_resource* mr_{nullptr}; }; } // namespace raft diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 68273db15c..fe543c97dd 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -112,7 +112,7 @@ auto make_device_mdarray(raft::resources const& handle, using mdarray_t = device_mdarray; typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy{}; + typename mdarray_t::container_policy_type policy{mr}; return mdarray_t{handle, layout, policy}; } diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh index b00d308827..61a6046273 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh @@ -45,7 +45,7 @@ struct check_index_layout { "paste in the new size and consider updating the serialization logic"); }; -template struct check_index_layout), 296>; +template struct check_index_layout), 328>; /** * Save the index to file. diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh index ff5bd8ef89..f01035cad3 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh @@ -48,7 +48,7 @@ struct check_index_layout { }; // TODO: Recompute this and come back to it. -template struct check_index_layout), 448>; +template struct check_index_layout), 480>; /** * Write the index to an output stream From 1b62e3af8a18ea8fefff2cb64870c144415f1273 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 14:24:57 +0200 Subject: [PATCH 11/25] Make helper to query workspace size --- .../core/resource/device_memory_resource.hpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 07b3dd0be0..8cd93fe46b 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -126,6 +126,25 @@ inline auto get_workspace_resource(resources const& res) resource_type::WORKSPACE_RESOURCE); }; +/** Get the total size of the workspace resource. */ +inline auto get_workspace_total_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocation_limit(); +}; + +/** Get the already allocated size of the workspace resource. */ +inline auto get_workspace_used_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocated_bytes(); +}; + +/** Get the available size of the workspace resource. */ +inline auto get_workspace_free_bytes(resources const& res) -> size_t +{ + const auto* p = get_workspace_resource(res); + return p->get_allocation_limit() - p->get_allocated_bytes(); +}; + /** * Set a temporary workspace resource on a resources instance. * From 5fed631645d8d512f52591cdcb11fc4056fed935 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 15:06:49 +0200 Subject: [PATCH 12/25] Tiny unrelated test fix: copy data in a stream. --- cpp/test/util/device_atomics.cu | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu index 5e8a67c8f6..f305c7c875 100644 --- a/cpp/test/util/device_atomics.cu +++ b/cpp/test/util/device_atomics.cu @@ -53,14 +53,11 @@ TEST(Raft, AtomicIncWarp) out_device.data()); // Copy data to host - RAFT_CUDA_TRY(cudaMemcpy(out_host.data(), - (const void*)out_device.data(), - num_elts * sizeof(int), - cudaMemcpyDeviceToHost)); + copy(out_host.data(), out_device.data(), num_elts, s); // Check that count is correct and that each thread index is contained in the // array exactly once. - ASSERT_EQ(num_elts, counter.value(s)); + ASSERT_EQ(num_elts, counter.value(s)); // NB: accessing the counter synchronizes `s` std::sort(out_host.begin(), out_host.end()); for (int i = 0; i < num_elts; ++i) { ASSERT_EQ(i, out_host[i]); From a2e749dff03223d6482e2f24367c4276b955f822 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 16:45:44 +0200 Subject: [PATCH 13/25] Update the API to always use shared pointers to the resources --- cpp/include/raft/core/device_resources.hpp | 10 +-- cpp/include/raft/core/handle.hpp | 9 +-- .../core/resource/device_memory_resource.hpp | 26 ------- cpp/test/core/handle.cpp | 71 ++++++++++++------- 4 files changed, 58 insertions(+), 58 deletions(-) diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index 6c7813890a..cf06920a8c 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -61,7 +61,7 @@ namespace raft { class device_resources : public resources { public: device_resources(const device_resources& handle, - rmm::mr::device_memory_resource* workspace_resource, + std::shared_ptr workspace_resource, std::optional allocation_limit = std::nullopt) : resources{handle} { @@ -86,8 +86,8 @@ class device_resources : public resources { */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr, - std::optional allocation_limit = std::nullopt) + std::shared_ptr workspace_resource = {nullptr}, + std::optional allocation_limit = std::nullopt) : resources{} { resources::add_resource_factory(std::make_shared()); @@ -95,7 +95,9 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); - resource::set_workspace_resource(*this, workspace_resource, allocation_limit); + if (workspace_resource) { + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); + } } /** Destroys all held-up resources */ diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 2a6b5657e2..124ab8c315 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -32,7 +32,8 @@ namespace raft { */ class handle_t : public raft::device_resources { public: - handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource) + handle_t(const handle_t& handle, + std::shared_ptr workspace_resource) : device_resources(handle, workspace_resource) { } @@ -51,9 +52,9 @@ class handle_t : public raft::device_resources { * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}, + std::shared_ptr workspace_resource = {nullptr}) : device_resources{stream_view, stream_pool, workspace_resource} { } diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 8cd93fe46b..33e8944dcc 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -164,30 +164,4 @@ inline void set_workspace_resource(resources const& res, std::make_shared(mr, allocation_limit, alignment)); }; -/** - * Set a temporary workspace resource on a resources instance. - * - * @param res raft resources object for managing resources - * @param mr an optional RMM device_memory_resource; - * note, the ownership of the object is not transferred with this raw pointer interface. - * @param allocation_limit - * the total amount of memory in bytes available to the temporary workspace resources. - * @param alignment optional alignment requirements passed to RMM allocations - * - */ -inline void set_workspace_resource(resources const& res, - rmm::mr::device_memory_resource* mr, - std::optional allocation_limit = std::nullopt, - std::optional alignment = std::nullopt) -{ - // NB: to preserve the semantics of passing memory resource without transferring the ownership, - // we create a shared pointer with a dummy deleter (void_op). - set_workspace_resource(res, - mr != nullptr - ? std::shared_ptr{mr, void_op{}} - : std::shared_ptr{nullptr}, - allocation_limit, - alignment); -}; - } // namespace raft::resource diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index 5436396b86..c0efe9236f 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -274,39 +275,61 @@ TEST(Raft, WorkspaceResource) { raft::handle_t handle; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); - ASSERT_EQ(rmm::mr::get_current_device_resource(), resource::get_workspace_resource(handle)); + // The returned resource is always a limiting adaptor + auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); - std::shared_ptr pool = {nullptr}; - raft::handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr); + // Let's create a pooled resource + auto pool_mr = std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle2)) != nullptr); - // ASSERT_EQ(pool_mr, resource::get_workspace_resource(handle2)); // TODO: limiting resource! + // A tiny workspace of 1MB + size_t max_size = 1024 * 1024; - delete pool_mr; -} - -TEST(Raft, WorkspaceResourceCopy) -{ - auto stream_pool = std::make_shared(10); + // Replace the resource + resource::set_workspace_resource(handle, pool_mr, max_size); + auto new_mr = resource::get_workspace_resource(handle); - handle_t handle(rmm::cuda_stream_per_thread, stream_pool); + // By this point, the orig_mr likely points to a non-existent resource; don't dereference! + ASSERT_NE(orig_mr, new_mr); + ASSERT_EQ(pool_mr.get(), new_mr->get_upstream()); + // We can safely reset pool_mr, because the shared_ptr to the pool memory stays in the resource + pool_mr.reset(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); + auto stream = resource::get_cuda_stream(handle); + rmm::device_buffer buf(max_size / 2, stream, new_mr); - handle_t copied_handle(handle, pool_mr); + // Note, the underlying pool allocator likey uses more space than reported here + ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle)); + ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle)); + ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle)); - assert_handles_equal(handle, copied_handle); + // this should throw, becaise we partially used the space. + ASSERT_THROW((rmm::device_buffer{max_size, stream, new_mr}), rmm::bad_alloc); +} - // Assert the workspace_resources are what we expect - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); +TEST(Raft, WorkspaceResourceCopy) +{ + raft::handle_t res; + auto orig_mr = resource::get_workspace_resource(res); + auto orig_size = resource::get_workspace_total_bytes(res); - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(copied_handle)) != nullptr); + { + // create a new handle in the inner scope and update the workspace resource for it. + raft::resources tmp_res(res); + resource::set_workspace_resource( + tmp_res, + std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}, + orig_size * 2); + + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); + + ASSERT_NE(orig_mr, resource::get_workspace_resource(tmp_res)); + ASSERT_NE(orig_size, resource::get_workspace_total_bytes(tmp_res)); + } + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); } TEST(Raft, HandleCopy) From 7736d768158636345bde43fb26ff0ed5d79d9806 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 28 Jun 2023 17:03:30 +0200 Subject: [PATCH 14/25] Fix a typo --- cpp/test/core/handle.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index c0efe9236f..a1ad4385a7 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -298,7 +298,7 @@ TEST(Raft, WorkspaceResource) auto stream = resource::get_cuda_stream(handle); rmm::device_buffer buf(max_size / 2, stream, new_mr); - // Note, the underlying pool allocator likey uses more space than reported here + // Note, the underlying pool allocator likely uses more space than reported here ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle)); ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle)); ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle)); From be63f73ead447d7fef65aedd8c234949d20bb9d8 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 29 Jun 2023 08:33:54 +0200 Subject: [PATCH 15/25] Rename limited->limiting resource for consistency --- .../raft/core/resource/device_memory_resource.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 33e8944dcc..af2761882a 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -29,18 +29,18 @@ #include namespace raft::resource { -class limited_memory_resource : public resource { +class limiting_memory_resource : public resource { public: - limited_memory_resource(std::shared_ptr mr, - std::size_t allocation_limit, - std::optional alignment) + limiting_memory_resource(std::shared_ptr mr, + std::size_t allocation_limit, + std::optional alignment) : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment)) { } auto get_resource() -> void* override { return &mr_; } - ~limited_memory_resource() override = default; + ~limiting_memory_resource() override = default; private: std::shared_ptr upstream_; @@ -79,7 +79,7 @@ class workspace_resource_factory : public resource_factory { auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; } auto make_resource() -> resource* override { - return new limited_memory_resource(mr_, allocation_limit_, alignment_); + return new limiting_memory_resource(mr_, allocation_limit_, alignment_); } private: From c70728a04f459e528333ee6cd237fd30844e82e5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 29 Jun 2023 12:49:43 +0200 Subject: [PATCH 16/25] Add comments --- .../core/resource/device_memory_resource.hpp | 98 +++++++++++++++++-- 1 file changed, 89 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index af2761882a..ec097c0333 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -82,30 +82,72 @@ class workspace_resource_factory : public resource_factory { return new limiting_memory_resource(mr_, allocation_limit_, alignment_); } + /** Construct a sensible default pool memory resource. */ + static inline auto default_pool_resource(std::size_t limit) + -> std::shared_ptr + { + // Set the default granularity to 1 GiB + constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; + // The initial size of the pool. The choice of this value only affects the performance a little + // bit. Heuristics: + // 1) the pool shouldn't be too big from the beginning independently of the limit; + // 2) otherwise, set it to half the max size to avoid too many resize calls. + auto min_size = std::min(kOneGb, limit / 2lu); + // The pool is going to be place behind the limiting resource adaptor. This means the user won't + // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool + // itself may consume a little bit more memory than the 'limit' due to memory fragmentation. + // Therefore, we look for a compromise, such that: + // 1) 'limit' is accurate - the user should be more likely to run into the limiting resource + // resource adaptor bad_alloc error than into the pool bad_alloc error. + // 2) The pool doesn't grab too much memory on top of the 'limit'. + auto max_size = std::min(limit + kOneGb / 2lu, limit * 3lu / 2lu); + auto upstream = rmm::mr::get_current_device_resource(); + return std::make_shared>( + upstream, min_size, max_size); + } + + /** Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). */ + static inline auto default_plain_resource() -> std::shared_ptr + { + return std::shared_ptr{rmm::mr::get_current_device_resource(), + void_op{}}; + } + private: std::size_t allocation_limit_; std::optional alignment_; std::shared_ptr mr_; - // Create a pool memory resource by default static inline auto default_memory_resource(std::size_t limit) -> std::shared_ptr { - constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; - auto min_size = std::min(kOneGb, limit / 2); - auto max_size = limit * 3lu / 2lu; - auto upstream = rmm::mr::get_current_device_resource(); - return std::make_shared>( - upstream, min_size, max_size); + if (rmm::mr::cuda_memory_resource{}.is_equal(*rmm::mr::get_current_device_resource())) { + // Use the memory pool if only we're sure the global memory resource is set to its default, + // which is the cuda_memory_resource. + // The reason for this is that some raft algorithms rely on the workspace allocator to be + // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths + // (batching), such as ANN-search routines. We don't want many allocations to happen there + // unless the user insists on it. + RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit); + return default_pool_resource(limit); + } else { + // If the user sets the global (rmm) memory resource to anything but the trivial + // cuda_memory_resource, we don't interfere that - they know better. In this case, the + // limiting resource adaptor is set on top the global (per-device) resource. + RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit); + return default_plain_resource(); + } } - // Allow a fraction of available memory by default. static inline auto default_allocation_limit() -> std::size_t { std::size_t free_size{}; std::size_t total_size{}; RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); - return free_size / 2; + // Note, the workspace does not claim all this memory from the start, so it's still usable by + // the main resource as well. + // This limit is merely an order for algorithm internals to plan the batching accordingly. + return total_size / 2; } }; @@ -164,4 +206,42 @@ inline void set_workspace_resource(resources const& res, std::make_shared(mr, allocation_limit, alignment)); }; +/** + * Set the temporary workspace resource to a pool on top of the global memory resource + * (`rmm::mr::get_current_device_resource()`. + * + * @param res raft resources object for managing resources + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources; + * if not provided, a last used or default limit is used. + * + */ +inline void set_workspace_to_pool_resource( + resources const& res, std::optional allocation_limit = std::nullopt) +{ + if (!allocation_limit.has_value()) { allocation_limit = get_workspace_total_bytes(res); } + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_pool_resource(*allocation_limit), + allocation_limit, + std::nullopt)); +}; + +/** + * Set the temporary workspace resource the same as the global memory resource + * (`rmm::mr::get_current_device_resource()`. + * + * Note, the workspace resource is always limited; the limit here defines how much of the global + * memory resource can be consumed by the workspace allocations. + * + * @param res raft resources object for managing resources + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + */ +inline void set_workspace_to_global_resource( + resources const& res, std::optional allocation_limit = std::nullopt) +{ + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt)); +}; + } // namespace raft::resource From be047d4eb03895005edbf44abb33e0a1c1295b33 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 29 Jun 2023 13:11:33 +0200 Subject: [PATCH 17/25] Remove repeated word in the comment --- cpp/include/raft/core/resource/device_memory_resource.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index ec097c0333..701cf43f5b 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -97,7 +97,7 @@ class workspace_resource_factory : public resource_factory { // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool // itself may consume a little bit more memory than the 'limit' due to memory fragmentation. // Therefore, we look for a compromise, such that: - // 1) 'limit' is accurate - the user should be more likely to run into the limiting resource + // 1) 'limit' is accurate - the user should be more likely to run into the limiting // resource adaptor bad_alloc error than into the pool bad_alloc error. // 2) The pool doesn't grab too much memory on top of the 'limit'. auto max_size = std::min(limit + kOneGb / 2lu, limit * 3lu / 2lu); From 3e151d4c0735b2173d55d931e24fd9e5158c1474 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 29 Jun 2023 13:13:37 +0200 Subject: [PATCH 18/25] Fix a missing word in the comment --- cpp/include/raft/core/resource/device_memory_resource.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 701cf43f5b..10e399439a 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -133,7 +133,7 @@ class workspace_resource_factory : public resource_factory { } else { // If the user sets the global (rmm) memory resource to anything but the trivial // cuda_memory_resource, we don't interfere that - they know better. In this case, the - // limiting resource adaptor is set on top the global (per-device) resource. + // limiting resource adaptor is set on top of the global (per-device) resource. RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit); return default_plain_resource(); } From 4cf64557d09e05ec287d2957025e5a1dee15a94a Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 7 Jul 2023 10:12:30 +0200 Subject: [PATCH 19/25] Add a deprecation comment to the mr argument --- cpp/include/raft/neighbors/ivf_pq-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index 8be13f675a..b875d81f15 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -444,8 +444,8 @@ void search_with_filtering(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr an optional memory resource to use across the searches (you can provide a large - * enough memory pool here to avoid memory allocations within search). + * @param[in] mr (deprecated) an optional memory resource to use across the searches (you can + * provide a large enough memory pool here to avoid memory allocations within search). */ template void search(raft::resources const& handle, From 127907ca32f6eedc645167b6e712415262808dd2 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 7 Jul 2023 10:42:40 +0200 Subject: [PATCH 20/25] Add function deprecations --- cpp/include/raft/neighbors/ivf_pq-ext.cuh | 45 ++++++++- cpp/include/raft/neighbors/ivf_pq-inl.cuh | 106 ++++++++++++++++------ 2 files changed, 118 insertions(+), 33 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh index 1595f55d8c..fcfe837e2d 100644 --- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh @@ -92,8 +92,7 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; template void search(raft::resources const& handle, @@ -103,8 +102,34 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + float* distances) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_pq @@ -182,7 +207,17 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr); \ + \ + extern template void raft::neighbors::ivf_pq::search( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::search_params& params, \ + const raft::neighbors::ivf_pq::index& idx, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances) instantiate_raft_neighbors_ivf_pq_search(float, int64_t); instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t); diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index b875d81f15..ccf8717486 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -166,7 +166,7 @@ void search_with_filtering(raft::resources const& handle, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { RAFT_EXPECTS( queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), @@ -229,7 +229,7 @@ void search(raft::resources const& handle, queries, neighbors, distances, - raft::neighbors::filtering::none_ivf_sample_filter()); + raft::neighbors::filtering::none_ivf_sample_filter{}); } /** @} */ // end group ivf_pq @@ -353,20 +353,17 @@ void extend(raft::resources const& handle, * eliminate entirely allocations happening within `search`: * @code{.cpp} * ... - * // Create a pooling memory resource with a pre-defined initial size. - * rmm::mr::pool_memory_resource mr( - * rmm::mr::get_current_device_resource(), 1024 * 1024); * // use default search parameters * ivf_pq::search_params search_params; * filtering::none_ivf_sample_filter filter; * // Use the same allocator across multiple searches to reduce the number of * // cuda memory allocations * ivf_pq::search_with_filtering( - * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr, filter); + * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, filter); * ivf_pq::search_with_filtering( - * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr, filter); + * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, filter); * ivf_pq::search_with_filtering( - * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr, filter); + * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, nfilter); * ... * @endcode * The exact size of the temporary buffer depends on multiple factors and is an implementation @@ -385,8 +382,6 @@ void extend(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr an optional memory resource to use across the searches (you can provide a large - * enough memory pool here to avoid memory allocations within search). * @param[in] sample_filter a filter the greenlights samples for a given query */ template @@ -398,11 +393,41 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { - detail::search( - handle, params, idx, queries, n_queries, k, neighbors, distances, mr, sample_filter); + detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); +} + +/** + * This function is deprecated and will be removed in a future. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. + */ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) +{ + if (mr != nullptr) { + // Shallow copy of the resource with the automatic lifespan: + // change the workspace resource temporarily + raft::resources res_local(handle); + resource::set_workspace_resource( + res_local, std::shared_ptr{mr, void_op{}}); + return search_with_filtering( + res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } else { + return search_with_filtering( + handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } } /** @@ -444,8 +469,6 @@ void search_with_filtering(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr (deprecated) an optional memory resource to use across the searches (you can - * provide a large enough memory pool here to avoid memory allocations within search). */ template void search(raft::resources const& handle, @@ -455,19 +478,46 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + float* distances) { - if (mr != nullptr) { - // Shallow copy of the resource with the automatic lifespan: - // change the workspace resource temporarily - raft::resources res_local(handle); - resource::set_workspace_resource( - res_local, std::shared_ptr{mr, void_op{}}); - return detail::search(res_local, params, idx, queries, n_queries, k, neighbors, distances); - } else { - return detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances); - } + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + raft::neighbors::filtering::none_ivf_sample_filter{}); +} + +/** + * This function is deprecated and will be removed in a future. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. + */ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) +{ + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + mr, + raft::neighbors::filtering::none_ivf_sample_filter{}); } } // namespace raft::neighbors::ivf_pq From d6a27c532168fd9ead7c36a0957105d102abe6c6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 7 Jul 2023 10:53:42 +0200 Subject: [PATCH 21/25] Remove ANN reference --- cpp/include/raft/core/resource/device_memory_resource.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 10e399439a..70b098db45 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -126,8 +126,8 @@ class workspace_resource_factory : public resource_factory { // which is the cuda_memory_resource. // The reason for this is that some raft algorithms rely on the workspace allocator to be // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths - // (batching), such as ANN-search routines. We don't want many allocations to happen there - // unless the user insists on it. + // (e.g. in batching). We don't want many allocations to happen there unless the user insists + // on it. RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit); return default_pool_resource(limit); } else { From 453042397d5b3e57b4941ae5ebe6e7ba3c2a2c42 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 7 Jul 2023 11:41:51 +0200 Subject: [PATCH 22/25] Use the plain workspace resource by default and print a warning if necessary --- .../detail/device_memory_resource.hpp | 58 +++++++++++++++++++ .../core/resource/device_memory_resource.hpp | 23 +------- .../raft/neighbors/detail/ivf_pq_build.cuh | 4 ++ .../raft/neighbors/detail/ivf_pq_search.cuh | 2 + 4 files changed, 65 insertions(+), 22 deletions(-) create mode 100644 cpp/include/raft/core/resource/detail/device_memory_resource.hpp diff --git a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp new file mode 100644 index 0000000000..9d3f13689d --- /dev/null +++ b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include +#include +#include + +namespace raft::resource::detail { + +/** + * Warn a user of the calling algorithm if they use the default non-pooled memory allocator, + * as it may hurt the performance. + * + * This helper function is designed to produce the warning once for a given `user_name`. + * + * @param[in] res + * @param[in] user_name the name of the algorithm or any other identification. + * + */ +inline void warn_non_pool_workspace(resources const& res, std::string user_name) +{ + // Detect if the plain cuda memory resource is used for the workspace + if (rmm::mr::cuda_memory_resource{}.is_equal(*get_workspace_resource(res)->get_upstream())) { + static std::set notified_names{}; + static std::mutex mutex{}; + std::lock_guard guard(mutex); + auto [it, inserted] = notified_names.insert(std::move(user_name)); + if (inserted) { + RAFT_LOG_WARN( + "[%s] the default cuda resource is used for the raft workspace allocations. This may lead " + "to a significant slowdown for this algorithm. Consider using the default pool resource " + "(`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly " + "(`raft::resource::set_workspace_resource`).", + it->c_str()); + } + } +} + +} // namespace raft::resource::detail diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 70b098db45..038e81e425 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -72,7 +72,7 @@ class workspace_resource_factory : public resource_factory { std::optional alignment = std::nullopt) : allocation_limit_(allocation_limit.value_or(default_allocation_limit())), alignment_(alignment), - mr_(mr ? mr : default_memory_resource(allocation_limit_)) + mr_(mr ? mr : default_plain_resource()) { } @@ -118,27 +118,6 @@ class workspace_resource_factory : public resource_factory { std::optional alignment_; std::shared_ptr mr_; - static inline auto default_memory_resource(std::size_t limit) - -> std::shared_ptr - { - if (rmm::mr::cuda_memory_resource{}.is_equal(*rmm::mr::get_current_device_resource())) { - // Use the memory pool if only we're sure the global memory resource is set to its default, - // which is the cuda_memory_resource. - // The reason for this is that some raft algorithms rely on the workspace allocator to be - // fast; e.g. some buffers are allocated and released in a loop in performance-critical paths - // (e.g. in batching). We don't want many allocations to happen there unless the user insists - // on it. - RAFT_LOG_DEBUG("The workspace uses the pool memory resource by default (limit: %zu)", limit); - return default_pool_resource(limit); - } else { - // If the user sets the global (rmm) memory resource to anything but the trivial - // cuda_memory_resource, we don't interfere that - they know better. In this case, the - // limiting resource adaptor is set on top of the global (per-device) resource. - RAFT_LOG_DEBUG("The workspace uses the global default memory resource (limit: %zu)", limit); - return default_plain_resource(); - } - } - static inline auto default_allocation_limit() -> std::size_t { std::size_t free_size{}; diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index a006e2f1d2..199cb74fbe 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1317,6 +1318,8 @@ void extend(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim()); + + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend"); auto stream = resource::get_cuda_stream(handle); const auto n_clusters = index->n_lists(); @@ -1515,6 +1518,7 @@ auto build(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build"); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index c0020ee490..ab690240e0 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -738,6 +739,7 @@ inline void search(raft::resources const& handle, params.n_probes, k, index.dim()); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::search"); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, From 775f71800a2b43dff8132f2b7624539f67cab571 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 18 Jul 2023 14:41:04 +0200 Subject: [PATCH 23/25] Add a note about no deleter --- cpp/include/raft/core/resource/device_memory_resource.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 038e81e425..6cb317f9b7 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -106,7 +106,13 @@ class workspace_resource_factory : public resource_factory { upstream, min_size, max_size); } - /** Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). */ + /** + * Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). + * + * Note: the lifetime of the underlying `rmm::mr::get_current_device_resource()` is managed + * somewhere else, since it's passed by a raw pointer. Hence, this shared_ptr wrapper is not + * allowed to delete the pointer on destruction. + */ static inline auto default_plain_resource() -> std::shared_ptr { return std::shared_ptr{rmm::mr::get_current_device_resource(), From d7fcde94a1ffcbdd721081b42a10662446bda4c9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 18 Jul 2023 14:41:48 +0200 Subject: [PATCH 24/25] Use the workspace resource size to determine the batch sizes for ivf-pq --- cpp/include/raft/neighbors/detail/ivf_pq_search.cuh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index ab690240e0..5d6186edac 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -678,6 +679,7 @@ struct ivfpq_search { * A heuristic for bounding the number of queries per batch, to improve GPU utilization. * (based on the number of SMs and the work size). * + * @param res is used to query the workspace size * @param k top-k * @param n_probes number of selected clusters per query * @param n_queries number of queries hoped to be processed at once. @@ -686,7 +688,8 @@ struct ivfpq_search { * * @return maximum recommended batch size. */ -inline auto get_max_batch_size(uint32_t k, +inline auto get_max_batch_size(raft::resources const& res, + uint32_t k, uint32_t n_probes, uint32_t n_queries, uint32_t max_samples) -> uint32_t @@ -705,11 +708,11 @@ inline auto get_max_batch_size(uint32_t k, auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; }; - constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024; - if (ws_size(max_batch_size) > kMaxWsSize) { + auto max_ws_size = resource::get_workspace_total_bytes(res); + if (ws_size(max_batch_size) > max_ws_size) { uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size); // gradually reduce the batch size until we fit into the max size limit. - while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) { + while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > max_ws_size) { smaller_batch_size >>= 1; } return smaller_batch_size; @@ -780,7 +783,7 @@ inline void search(raft::resources const& handle, // Maximum number of query vectors to search at the same time. const auto max_queries = std::min(std::max(n_queries, 1), 4096); - auto max_batch_size = get_max_batch_size(k, n_probes, max_queries, max_samples); + auto max_batch_size = get_max_batch_size(handle, k, n_probes, max_queries, max_samples); rmm::device_uvector float_queries(max_queries * dim_ext, stream, mr); rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); From ae0f4693f3ea243adaf3f976e6123f074d82521f Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 25 Jul 2023 13:41:38 +0200 Subject: [PATCH 25/25] Use get_workspace_free_bytes and debug-log the usage of the default pool resource --- cpp/include/raft/core/resource/device_memory_resource.hpp | 6 ++++++ cpp/include/raft/neighbors/detail/ivf_pq_search.cuh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 6cb317f9b7..af684df747 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -102,6 +102,12 @@ class workspace_resource_factory : public resource_factory { // 2) The pool doesn't grab too much memory on top of the 'limit'. auto max_size = std::min(limit + kOneGb / 2lu, limit * 3lu / 2lu); auto upstream = rmm::mr::get_current_device_resource(); + RAFT_LOG_DEBUG( + "Setting the workspace pool resource; memory limit = %zu, initial pool size = %zu, max pool " + "size = %zu.", + limit, + min_size, + max_size); return std::make_shared>( upstream, min_size, max_size); } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 5d6186edac..298083d1e5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -708,7 +708,7 @@ inline auto get_max_batch_size(raft::resources const& res, auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; }; - auto max_ws_size = resource::get_workspace_total_bytes(res); + auto max_ws_size = resource::get_workspace_free_bytes(res); if (ws_size(max_batch_size) > max_ws_size) { uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size); // gradually reduce the batch size until we fit into the max size limit.