diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index eef981e56f..011de307db 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -164,10 +164,19 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res)); + if (mr_ == nullptr) { + // NB: not using the workspace resource by default! + // The workspace resource is for short-lived temporary allocations. + return container_type(n, resource::get_cuda_stream(res)); + } else { + return container_type(n, resource::get_cuda_stream(res), mr_); + } } - device_uvector_policy() = default; + constexpr device_uvector_policy() = default; + constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr) + { + } [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -181,6 +190,9 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + + private: + rmm::mr::device_memory_resource* mr_{nullptr}; }; } // namespace raft diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 68273db15c..fe543c97dd 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -112,7 +112,7 @@ auto make_device_mdarray(raft::resources const& handle, using mdarray_t = device_mdarray; typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy{}; + typename mdarray_t::container_policy_type policy{mr}; return mdarray_t{handle, layout, policy}; } diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index c620a688b9..cf06920a8c 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -60,12 +61,12 @@ namespace raft { class device_resources : public resources { public: device_resources(const device_resources& handle, - rmm::mr::device_memory_resource* workspace_resource) + std::shared_ptr workspace_resource, + std::optional allocation_limit = std::nullopt) : resources{handle} { // replace the resource factory for the workspace_resources - resources::add_resource_factory( - std::make_shared(workspace_resource)); + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); } device_resources(const device_resources& handle) : resources{handle} {} @@ -80,10 +81,13 @@ class device_resources : public resources { * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. + * @param[in] allocation_limit the total amount of memory in bytes available to the temporary + * workspace resources. 
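+   *
+   * A minimal usage sketch (the pool resource and the 1 GiB limit below are illustrative choices,
+   * not defaults):
+   * @code{.cpp}
+   *   auto pool = std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(
+   *     rmm::mr::get_current_device_resource());
+   *   raft::device_resources res(rmm::cuda_stream_per_thread, nullptr, pool, size_t{1} << 30);
+   * @endcode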
*/ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + std::shared_ptr workspace_resource = {nullptr}, + std::optional allocation_limit = std::nullopt) : resources{} { resources::add_resource_factory(std::make_shared()); @@ -91,8 +95,9 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); - resources::add_resource_factory( - std::make_shared(workspace_resource)); + if (workspace_resource) { + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); + } } /** Destroys all held-up resources */ @@ -255,4 +260,4 @@ class stream_syncer { } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 2a6b5657e2..124ab8c315 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -32,7 +32,8 @@ namespace raft { */ class handle_t : public raft::device_resources { public: - handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource) + handle_t(const handle_t& handle, + std::shared_ptr workspace_resource) : device_resources(handle, workspace_resource) { } @@ -51,9 +52,9 @@ class handle_t : public raft::device_resources { * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. */ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}, + std::shared_ptr workspace_resource = {nullptr}) : device_resources{stream_view, stream_pool, workspace_resource} { } diff --git a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp new file mode 100644 index 0000000000..9d3f13689d --- /dev/null +++ b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include +#include +#include + +namespace raft::resource::detail { + +/** + * Warn a user of the calling algorithm if they use the default non-pooled memory allocator, + * as it may hurt the performance. + * + * This helper function is designed to produce the warning once for a given `user_name`. + * + * @param[in] res + * @param[in] user_name the name of the algorithm or any other identification. 
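+ *
+ * Intended call pattern (the algorithm name below is just an example):
+ * @code{.cpp}
+ *   raft::resource::detail::warn_non_pool_workspace(res, "raft::ivf_pq::search");
+ * @endcode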
+ * + */ +inline void warn_non_pool_workspace(resources const& res, std::string user_name) +{ + // Detect if the plain cuda memory resource is used for the workspace + if (rmm::mr::cuda_memory_resource{}.is_equal(*get_workspace_resource(res)->get_upstream())) { + static std::set notified_names{}; + static std::mutex mutex{}; + std::lock_guard guard(mutex); + auto [it, inserted] = notified_names.insert(std::move(user_name)); + if (inserted) { + RAFT_LOG_WARN( + "[%s] the default cuda resource is used for the raft workspace allocations. This may lead " + "to a significant slowdown for this algorithm. Consider using the default pool resource " + "(`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly " + "(`raft::resource::set_workspace_resource`).", + it->c_str()); + } + } +} + +} // namespace raft::resource::detail diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index ebc41e0f8e..af684df747 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -15,24 +15,49 @@ */ #pragma once +#include #include #include +#include + #include +#include #include +#include + +#include +#include namespace raft::resource { -class device_memory_resource : public resource { +class limiting_memory_resource : public resource { public: - device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) + limiting_memory_resource(std::shared_ptr mr, + std::size_t allocation_limit, + std::optional alignment) + : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment)) { - if (mr_ == nullptr) { mr = rmm::mr::get_current_device_resource(); } } - void* get_resource() override { return mr; } - ~device_memory_resource() override {} + auto get_resource() -> void* override { return &mr_; } + + ~limiting_memory_resource() override = default; private: - rmm::mr::device_memory_resource* mr; + std::shared_ptr upstream_; + rmm::mr::limiting_resource_adaptor mr_; + + static inline auto make_adaptor(std::shared_ptr upstream, + std::size_t limit, + std::optional alignment) + -> rmm::mr::limiting_resource_adaptor + { + auto p = upstream.get(); + if (alignment.has_value()) { + return rmm::mr::limiting_resource_adaptor(p, limit, alignment.value()); + } else { + return rmm::mr::limiting_resource_adaptor(p, limit); + } + } }; /** @@ -41,36 +66,173 @@ class device_memory_resource : public resource { */ class workspace_resource_factory : public resource_factory { public: - workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {} - resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; } - resource* make_resource() override { return new device_memory_resource(mr); } + explicit workspace_resource_factory( + std::shared_ptr mr = {nullptr}, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) + : allocation_limit_(allocation_limit.value_or(default_allocation_limit())), + alignment_(alignment), + mr_(mr ? mr : default_plain_resource()) + { + } + + auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; } + auto make_resource() -> resource* override + { + return new limiting_memory_resource(mr_, allocation_limit_, alignment_); + } + + /** Construct a sensible default pool memory resource. 
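+   *
+   * A sketch of the intended use (the 4 GiB limit is an arbitrary example); this is roughly what
+   * `set_workspace_to_pool_resource` does for you:
+   * @code{.cpp}
+   *   auto limit = size_t{4} << 30;
+   *   raft::resource::set_workspace_resource(
+   *     res, workspace_resource_factory::default_pool_resource(limit), limit);
+   * @endcode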
*/ + static inline auto default_pool_resource(std::size_t limit) + -> std::shared_ptr + { + // Set the default granularity to 1 GiB + constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; + // The initial size of the pool. The choice of this value only affects the performance a little + // bit. Heuristics: + // 1) the pool shouldn't be too big from the beginning independently of the limit; + // 2) otherwise, set it to half the max size to avoid too many resize calls. + auto min_size = std::min(kOneGb, limit / 2lu); + // The pool is going to be placed behind the limiting resource adaptor. This means the user won't + // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool + // itself may consume a little bit more memory than the 'limit' due to memory fragmentation. + // Therefore, we look for a compromise, such that: + // 1) 'limit' is accurate - the user should be more likely to run into the limiting + // resource adaptor bad_alloc error than into the pool bad_alloc error. + // 2) The pool doesn't grab too much memory on top of the 'limit'. + auto max_size = std::min(limit + kOneGb / 2lu, limit * 3lu / 2lu); + auto upstream = rmm::mr::get_current_device_resource(); + RAFT_LOG_DEBUG( + "Setting the workspace pool resource; memory limit = %zu, initial pool size = %zu, max pool " + "size = %zu.", + limit, + min_size, + max_size); + return std::make_shared>( + upstream, min_size, max_size); + } + + /** + * Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). + * + * Note: the lifetime of the underlying `rmm::mr::get_current_device_resource()` is managed + * somewhere else, since it's passed by a raw pointer. Hence, this shared_ptr wrapper is not + * allowed to delete the pointer on destruction. + */ + static inline auto default_plain_resource() -> std::shared_ptr + { + return std::shared_ptr{rmm::mr::get_current_device_resource(), + void_op{}}; + } private: - rmm::mr::device_memory_resource* mr; + std::size_t allocation_limit_; + std::optional alignment_; + std::shared_ptr mr_; + + static inline auto default_allocation_limit() -> std::size_t + { + std::size_t free_size{}; + std::size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + // Note, the workspace does not claim all this memory from the start, so it's still usable by + // the main resource as well. + // This limit is merely a hint for algorithm internals to plan the batching accordingly. + return total_size / 2; + } }; /** * Load a temp workspace resource from a resources instance (and populate it on the res * if needed). + * * @param res raft resources object for managing resources * @return device memory resource object */ -inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res) +inline auto get_workspace_resource(resources const& res) + -> rmm::mr::limiting_resource_adaptor* { if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { res.add_resource_factory(std::make_shared()); } - return res.get_resource(resource_type::WORKSPACE_RESOURCE); + return res.get_resource>( + resource_type::WORKSPACE_RESOURCE); +}; + +/** Get the total size of the workspace resource. */ +inline auto get_workspace_total_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocation_limit(); +}; + +/** Get the already allocated size of the workspace resource.
*/ +inline auto get_workspace_used_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocated_bytes(); +}; + +/** Get the available size of the workspace resource. */ +inline auto get_workspace_free_bytes(resources const& res) -> size_t +{ + const auto* p = get_workspace_resource(res); + return p->get_allocation_limit() - p->get_allocated_bytes(); +}; + +/** + * Set a temporary workspace resource on a resources instance. + * + * @param res raft resources object for managing resources + * @param mr an optional RMM device_memory_resource + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + * @param alignment optional alignment requirements passed to RMM allocations + * + */ +inline void set_workspace_resource(resources const& res, + std::shared_ptr mr = {nullptr}, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) +{ + res.add_resource_factory( + std::make_shared(mr, allocation_limit, alignment)); +}; + +/** + * Set the temporary workspace resource to a pool on top of the global memory resource + * (`rmm::mr::get_current_device_resource()`). + * + * @param res raft resources object for managing resources + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources; + * if not provided, the last used or default limit is used. + * + */ +inline void set_workspace_to_pool_resource( + resources const& res, std::optional allocation_limit = std::nullopt) +{ + if (!allocation_limit.has_value()) { allocation_limit = get_workspace_total_bytes(res); } + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_pool_resource(*allocation_limit), + allocation_limit, + std::nullopt)); }; /** - * Set a temp workspace resource on a resources instance. + * Set the temporary workspace resource to be the same as the global memory resource + * (`rmm::mr::get_current_device_resource()`). + * + * Note, the workspace resource is always limited; the limit here defines how much of the global + * memory resource can be consumed by the workspace allocations. * * @param res raft resources object for managing resources - * @param mr a valid rmm device_memory_resource + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources.
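+ *
+ * For example (the 2 GiB cap is an illustrative value):
+ * @code{.cpp}
+ *   // Workspace allocations go straight to the global resource, but are capped at 2 GiB,
+ *   // so algorithm internals can plan batches via raft::resource::get_workspace_free_bytes(res).
+ *   raft::resource::set_workspace_to_global_resource(res, size_t{2} << 30);
+ * @endcode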
*/ -inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr) +inline void set_workspace_to_global_resource( + resources const& res, std::optional allocation_limit = std::nullopt) { - res.add_resource_factory(std::make_shared(mr)); + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt)); }; + } // namespace raft::resource diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp index e0f51b61b4..d5bd176d50 100644 --- a/cpp/include/raft/core/resources.hpp +++ b/cpp/include/raft/core/resources.hpp @@ -95,6 +95,11 @@ class resources { RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY, "LAST_KEY is a placeholder and not a valid resource factory type."); factories_.at(rtype) = std::make_pair(rtype, factory); + // Clear the corresponding resource, so that on next `get_resource` the new factory is used + if (resources_.at(rtype).first != resource::resource_type::LAST_KEY) { + resources_.at(rtype) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared()); + } } /** diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh index b00d308827..61a6046273 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh @@ -45,7 +45,7 @@ struct check_index_layout { "paste in the new size and consider updating the serialization logic"); }; -template struct check_index_layout), 296>; +template struct check_index_layout), 328>; /** * Save the index to file. diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 4a54d33a02..199cb74fbe 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -346,10 +347,10 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); @@ -392,10 +393,6 @@ void train_per_subset(raft::resources const& handle, index.pq_len(), stream); - // clone the handle and attached the device memory resource to it - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); @@ -409,7 +406,7 @@ void train_per_subset(raft::resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, sub_trainset_view, centers_tmp_view, @@ -427,10 +424,10 @@ void 
train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); @@ -474,10 +471,6 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // clone the handle and attached the device memory resource to it - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // limit the cluster size to bound the training time. // [sic] we interpret the data as pq_len-dimensional size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); @@ -498,7 +491,7 @@ void train_per_cluster(raft::resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, rot_vectors_view, centers_tmp_view, @@ -1325,6 +1318,8 @@ void extend(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim()); + + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend"); auto stream = resource::get_cuda_stream(handle); const auto n_clusters = index->n_lists(); @@ -1523,6 +1518,7 @@ auto build(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build"); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); @@ -1543,24 +1539,18 @@ auto build(raft::resources const& handle, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq::build: using pool memory resource"); } - + auto* device_memory = resource::get_workspace_resource(handle); rmm::mr::managed_memory_resource managed_memory_upstream; rmm::mr::pool_memory_resource managed_memory( &managed_memory_upstream, 1024 * 1024); // If the trainset is small enough to comfortably fit into device memory, put it there. // Otherwise, use the managed memory. 
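+  // Heuristic: assume the scratch space needed alongside the float trainset is a small multiple
+  // of the raw trainset size (the exact factor below is a judgement call, not a precise bound).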
+ constexpr size_t kTolerableRatio = 4; rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; - { - size_t free_mem, total_mem; - constexpr size_t kTolerableRatio = 4; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); - if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) { - big_memory_resource = device_memory; - } + if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < + resource::get_workspace_free_bytes(handle)) { + big_memory_resource = device_memory; } // Besides just sampling, we transform the input dataset into floats to make it easier @@ -1709,8 +1699,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, @@ -1719,8 +1708,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; default: RAFT_FAIL("Unreachable code"); } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 82b1ac1542..298083d1e5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include #include @@ -429,10 +431,10 @@ void ivfpq_search_worker(raft::resources const& handle, float* distances, // [n_queries, topK] float scaling_factor, double preferred_shmem_carveout, - IvfSampleFilterT sample_filter, - rmm::mr::device_memory_resource* mr) + IvfSampleFilterT sample_filter) { auto stream = resource::get_cuda_stream(handle); + auto mr = resource::get_workspace_resource(handle); bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); auto topk_len = manage_local_topk ? n_probes * topK : max_samples; @@ -677,6 +679,7 @@ struct ivfpq_search { * A heuristic for bounding the number of queries per batch, to improve GPU utilization. * (based on the number of SMs and the work size). * + * @param res is used to query the workspace size * @param k top-k * @param n_probes number of selected clusters per query * @param n_queries number of queries hoped to be processed at once. @@ -685,7 +688,8 @@ struct ivfpq_search { * * @return maximum recommended batch size. */ -inline auto get_max_batch_size(uint32_t k, +inline auto get_max_batch_size(raft::resources const& res, + uint32_t k, uint32_t n_probes, uint32_t n_queries, uint32_t max_samples) -> uint32_t @@ -704,11 +708,11 @@ inline auto get_max_batch_size(uint32_t k, auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; }; - constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024; - if (ws_size(max_batch_size) > kMaxWsSize) { + auto max_ws_size = resource::get_workspace_free_bytes(res); + if (ws_size(max_batch_size) > max_ws_size) { uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size); // gradually reduce the batch size until we fit into the max size limit. 
- while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) { + while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > max_ws_size) { smaller_batch_size >>= 1; } return smaller_batch_size; @@ -728,8 +732,7 @@ inline void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT()) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); @@ -739,6 +742,7 @@ inline void search(raft::resources const& handle, params.n_probes, k, index.dim()); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::search"); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, @@ -775,15 +779,11 @@ inline void search(raft::resources const& handle, max_samples = ms; } - auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16); - if (pool_guard) { - RAFT_LOG_DEBUG("ivf_pq::search: using pool memory resource with initial size %zu bytes", - n_queries * n_probes * k * 16ull); - } + auto mr = resource::get_workspace_resource(handle); // Maximum number of query vectors to search at the same time. const auto max_queries = std::min(std::max(n_queries, 1), 4096); - auto max_batch_size = get_max_batch_size(k, n_probes, max_queries, max_samples); + auto max_batch_size = get_max_batch_size(handle, k, n_probes, max_queries, max_samples); rmm::device_uvector float_queries(max_queries * dim_ext, stream, mr); rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); @@ -845,8 +845,7 @@ inline void search(raft::resources const& handle, distances + uint64_t(k) * (offset_q + offset_b), utils::config::kDivisor / utils::config::kDivisor, params.preferred_shmem_carveout, - sample_filter, - mr); + sample_filter); } } } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh index ff5bd8ef89..f01035cad3 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh @@ -48,7 +48,7 @@ struct check_index_layout { }; // TODO: Recompute this and come back to it. 
-template struct check_index_layout), 448>; +template struct check_index_layout), 480>; /** * Write the index to an output stream diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh index 1595f55d8c..fcfe837e2d 100644 --- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh @@ -92,8 +92,7 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; template void search(raft::resources const& handle, @@ -103,8 +102,34 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + float* distances) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_pq @@ -182,7 +207,17 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr); \ + \ + extern template void raft::neighbors::ivf_pq::search( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::search_params& params, \ + const raft::neighbors::ivf_pq::index& idx, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances) instantiate_raft_neighbors_ivf_pq_search(float, int64_t); instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t); diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index ad9d95f790..ccf8717486 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -16,17 +16,18 @@ #pragma once -#include #include #include #include #include #include +#include #include -#include -#include +#include + +#include // shared_ptr namespace raft::neighbors::ivf_pq { @@ -165,7 +166,7 @@ void search_with_filtering(raft::resources const& handle, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { RAFT_EXPECTS( queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), @@ -186,7 +187,6 @@ void search_with_filtering(raft::resources const& handle, k, neighbors.data_handle(), distances.data_handle(), - resource::get_workspace_resource(handle), sample_filter); } @@ -229,7 +229,7 @@ void search(raft::resources const& handle, 
queries, neighbors, distances, - raft::neighbors::filtering::none_ivf_sample_filter()); + raft::neighbors::filtering::none_ivf_sample_filter{}); } /** @} */ // end group ivf_pq @@ -353,20 +353,17 @@ void extend(raft::resources const& handle, * eliminate entirely allocations happening within `search`: * @code{.cpp} * ... - * // Create a pooling memory resource with a pre-defined initial size. - * rmm::mr::pool_memory_resource mr( - * rmm::mr::get_current_device_resource(), 1024 * 1024); * // use default search parameters * ivf_pq::search_params search_params; * filtering::none_ivf_sample_filter filter; * // Use the same allocator across multiple searches to reduce the number of * // cuda memory allocations * ivf_pq::search_with_filtering( - * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr, filter); + * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, filter); * ivf_pq::search_with_filtering( - * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr, filter); + * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, filter); * ivf_pq::search_with_filtering( - * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr, filter); + * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, filter); * ... * @endcode * The exact size of the temporary buffer depends on multiple factors and is an implementation * @@ -385,8 +382,6 @@ void extend(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr an optional memory resource to use across the searches (you can provide a large - * enough memory pool here to avoid memory allocations within search). * @param[in] sample_filter a filter the greenlights samples for a given query */ template @@ -398,11 +393,41 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { - detail::search( - handle, params, idx, queries, n_queries, k, neighbors, distances, mr, sample_filter); + detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); +} + +/** + * This function is deprecated and will be removed in a future release. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
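+ *
+ * A migration sketch (assuming `mr` outlives the handle; `raft::void_op` serves as a no-op
+ * deleter so the shared_ptr does not take ownership of `mr`):
+ * @code{.cpp}
+ *   raft::resource::set_workspace_resource(
+ *     handle, std::shared_ptr<rmm::mr::device_memory_resource>{mr, raft::void_op{}});
+ *   ivf_pq::search_with_filtering(
+ *     handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
+ * @endcode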
*/ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) +{ + if (mr != nullptr) { + // Shallow copy of the resource with the automatic lifespan: + // change the workspace resource temporarily + raft::resources res_local(handle); + resource::set_workspace_resource( + res_local, std::shared_ptr{mr, void_op{}}); + return search_with_filtering( + res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } else { + return search_with_filtering( + handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } } /** @@ -444,8 +469,6 @@ void search_with_filtering(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr an optional memory resource to use across the searches (you can provide a large - * enough memory pool here to avoid memory allocations within search). */ template void search(raft::resources const& handle, @@ -455,10 +478,46 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + float* distances) +{ + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + raft::neighbors::filtering::none_ivf_sample_filter{}); +} + +/** + * This function is deprecated and will be removed in a future release. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
*/ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) { - detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, mr); + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + mr, + raft::neighbors::filtering::none_ivf_sample_filter{}); } } // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index 8c5e023df3..a1ad4385a7 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -274,39 +275,61 @@ TEST(Raft, WorkspaceResource) { raft::handle_t handle; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); - ASSERT_EQ(rmm::mr::get_current_device_resource(), resource::get_workspace_resource(handle)); + // The returned resource is always a limiting adaptor + auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); - std::shared_ptr pool = {nullptr}; - raft::handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr); + // Let's create a pooled resource + auto pool_mr = std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle2)) != nullptr); - ASSERT_EQ(pool_mr, resource::get_workspace_resource(handle2)); + // A tiny workspace of 1MB + size_t max_size = 1024 * 1024; - delete pool_mr; -} - -TEST(Raft, WorkspaceResourceCopy) -{ - auto stream_pool = std::make_shared(10); + // Replace the resource + resource::set_workspace_resource(handle, pool_mr, max_size); + auto new_mr = resource::get_workspace_resource(handle); - handle_t handle(rmm::cuda_stream_per_thread, stream_pool); + // By this point, the orig_mr likely points to a non-existent resource; don't dereference! + ASSERT_NE(orig_mr, new_mr); + ASSERT_EQ(pool_mr.get(), new_mr->get_upstream()); + // We can safely reset pool_mr, because the shared_ptr to the pool memory stays in the resource + pool_mr.reset(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); + auto stream = resource::get_cuda_stream(handle); + rmm::device_buffer buf(max_size / 2, stream, new_mr); - handle_t copied_handle(handle, pool_mr); + // Note, the underlying pool allocator likely uses more space than reported here + ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle)); + ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle)); + ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle)); - assert_handles_equal(handle, copied_handle); + // this should throw, because we partially used the space.
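+  // (it is the limiting adaptor that rejects the request; the pool underneath could still grow)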
+ ASSERT_THROW((rmm::device_buffer{max_size, stream, new_mr}), rmm::bad_alloc); +} - // Assert the workspace_resources are what we expect - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); +TEST(Raft, WorkspaceResourceCopy) +{ + raft::handle_t res; + auto orig_mr = resource::get_workspace_resource(res); + auto orig_size = resource::get_workspace_total_bytes(res); - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(copied_handle)) != nullptr); + { + // create a new handle in the inner scope and update the workspace resource for it. + raft::resources tmp_res(res); + resource::set_workspace_resource( + tmp_res, + std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}, + orig_size * 2); + + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); + + ASSERT_NE(orig_mr, resource::get_workspace_resource(tmp_res)); + ASSERT_NE(orig_size, resource::get_workspace_total_bytes(tmp_res)); + } + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); } TEST(Raft, HandleCopy) diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu index 355cb0d4dd..56f798b617 100644 --- a/cpp/test/util/device_atomics.cu +++ b/cpp/test/util/device_atomics.cu @@ -60,7 +60,7 @@ TEST(Raft, AtomicIncWarp) // Check that count is correct and that each thread index is contained in the // array exactly once. - ASSERT_EQ(num_elts, counter.value(s)); + ASSERT_EQ(num_elts, counter.value(s)); // NB: accessing the counter synchronizes `s` std::sort(out_host.begin(), out_host.end()); for (int i = 0; i < num_elts; ++i) { ASSERT_EQ(i, out_host[i]);