From 73f78c6b418311ac4b6f075c99c11635517fc2ed Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 24 Nov 2021 11:09:41 +0100 Subject: [PATCH 1/6] Remove RAFT memory management --- cpp/include/raft/cudart_utils.h | 32 ---- cpp/include/raft/mr/allocator.hpp | 70 -------- cpp/include/raft/mr/buffer_base.hpp | 210 ----------------------- cpp/include/raft/mr/device/allocator.hpp | 54 ------ cpp/include/raft/mr/device/buffer.hpp | 68 -------- cpp/include/raft/mr/host/allocator.hpp | 58 ------- cpp/include/raft/mr/host/buffer.hpp | 84 --------- cpp/include/raft/spatial/knn/knn.hpp | 2 - python/raft/common/handle.pxd | 5 - 9 files changed, 583 deletions(-) delete mode 100644 cpp/include/raft/mr/allocator.hpp delete mode 100644 cpp/include/raft/mr/buffer_base.hpp delete mode 100644 cpp/include/raft/mr/device/allocator.hpp delete mode 100644 cpp/include/raft/mr/device/buffer.hpp delete mode 100644 cpp/include/raft/mr/host/allocator.hpp delete mode 100644 cpp/include/raft/mr/host/buffer.hpp diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 486103dedb..85b1e7cb94 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -277,38 +277,6 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -static std::mutex mutex_; -static std::unordered_map allocations; - -template -void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, - bool setZero = false) { - size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); - - std::lock_guard _(mutex_); - allocations[ptr] = size; -} - -template -void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { - std::lock_guard _(mutex_); - size_t size = allocations[ptr]; - allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); -} - -inline void deallocate_all(rmm::cuda_stream_view stream) { - std::lock_guard _(mutex_); - for (auto& alloc : allocations) { - void* ptr = alloc.first; - size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); - } - allocations.clear(); -} - /** helper method to get max usable shared mem per block parameter */ inline int getSharedMemPerBlock() { int devId; diff --git a/cpp/include/raft/mr/allocator.hpp b/cpp/include/raft/mr/allocator.hpp deleted file mode 100644 index 08a4987c91..0000000000 --- a/cpp/include/raft/mr/allocator.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace raft { -namespace mr { - -/** - * @brief Interface for an asynchronous device/host allocator. - * - * An implementation of this interface can make the following assumptions: - * - It does not need to be but it can allow async allocate and deallocate. - * - * @note This interface does NOT support RAII. 
Thus, if you need RAII-enabled - * interface, better to use `device_buffer` or `host_buffer`. - */ -class base_allocator { - public: - /** - * @brief Asynchronously allocates a memory region. - * - * An implementation of this need to return a allocation of n bytes properly - * align bytes on the configured device. The allocation can optionally be - * asynchronous in the sense that it is only save to use after all work - * submitted to the passed in stream prior to the call to allocate has - * completed. If the allocation is used before, e.g. in another stream the - * behaviour may be undefined. - * @todo: Add alignment requirments. - * - * @param[in] n number of bytes to allocate - * @param[in] stream stream to issue the possible asynchronous allocation in - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) = 0; - - /** - * @brief Asynchronously deallocates device memory - * - * An implementation of this need to ensure that the allocation that the - * passed in pointer points to remains usable until all work sheduled in - * stream prior to the call to deallocate has completed. - * - * @param[inout] p pointer to the buffer to deallocte - * @param[in] n size of the buffer to deallocte in bytes - * @param[in] stream stream in which the allocation might be still in use - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0; - - virtual ~base_allocator() = default; -}; // class base_allocator - -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp deleted file mode 100644 index 4a2362bf97..0000000000 --- a/cpp/include/raft/mr/buffer_base.hpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include - -namespace raft { -namespace mr { - -/** - * @brief Base for all RAII-based owning of temporary memory allocations. This - * class should ideally not be used by users directly, but instead via - * the child classes `device_buffer` and `host_buffer`. 
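The buffer classes being deleted here map directly onto RMM's stream-ordered containers; a minimal sketch of the replacement pattern, assuming only the rmm::device_uvector API that later patches in this series already use:

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_uvector.hpp>

    template <typename T>
    void foo(rmm::cuda_stream_view stream, std::size_t n)
    {
      // was: raft::mr::device::buffer<T> temp(allocator, stream, 0); temp.resize(n, stream);
      rmm::device_uvector<T> temp(n, stream);  // allocation is ordered on `stream`
      // kernelA<<<grid, blk, 0, stream>>>(..., temp.data(), ...);
      temp.resize(2 * n, stream);  // stream-ordered grow; existing contents preserved
      temp.release();              // explicit early release; otherwise freed in the dtor
    }
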
- * - * @tparam T data type - * @tparam AllocatorT The underly allocator object - */ -template -class buffer_base { - public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; - using const_reference = const T&; - - buffer_base() = delete; - - buffer_base(const buffer_base& other) = delete; - - buffer_base& operator=(const buffer_base& other) = delete; - - /** - * @brief Main ctor - * - * @param[in] allocator asynchronous allocator used for managing buffer life - * @param[in] stream cuda stream where this allocation operations are async - * @param[in] n size of the buffer (in number of elements) - */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, - size_type n = 0) - : data_(nullptr), - size_(n), - capacity_(n), - stream_(stream), - allocator_(std::move(allocator)) { - if (capacity_ > 0) { - data_ = static_cast( - allocator_->allocate(capacity_ * sizeof(value_type), stream_)); - CUDA_CHECK(cudaStreamSynchronize(stream_)); - } - } - - ~buffer_base() { release(); } - - value_type* data() { return data_; } - - const value_type* data() const { return data_; } - - size_type size() const { return size_; } - - void clear() { size_ = 0; } - - iterator begin() { return data_; } - - const_iterator begin() const { return data_; } - - iterator end() { return data_ + size_; } - - const_iterator end() const { return data_ + size_; } - - /** - * @brief Reserve new memory size for this buffer. - * - * It re-allocates a fresh buffer if the new requested capacity is more than - * the current one, copies the old buffer contents to this new buffer and - * removes the old one. - * - * @param[in] new_capacity new capacity (in number of elements) - * @{ - */ - void reserve(size_type new_capacity) { - if (new_capacity > capacity_) { - auto* new_data = static_cast( - allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { - raft::copy(new_data, data_, size_, stream_); - } - // Only deallocate if we have allocated a pointer - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = new_data; - capacity_ = new_capacity; - } - } - - void reserve(size_type new_capacity, cudaStream_t stream) { - set_stream(stream); - reserve(new_capacity); - } - /** @} */ - - /** - * @brief Resize the underlying buffer (uses `reserve` method internally) - * - * @param[in] new_size new buffer size - * @{ - */ - void resize(const size_type new_size) { - reserve(new_size); - size_ = new_size; - } - - void resize(const size_type new_size, cudaStream_t stream) { - set_stream(stream); - resize(new_size); - } - /** @} */ - - /** - * @brief Deletes the underlying buffer - * - * If this method is not explicitly called, it will be during the destructor - * @{ - */ - void release() { - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = nullptr; - capacity_ = 0; - size_ = 0; - } - - void release(cudaStream_t stream) { - set_stream(stream); - release(); - } - /** @} */ - - /** - * @brief returns the underlying allocator used - * - * @return the allocator pointer - */ - std::shared_ptr get_allocator() const { return allocator_; } - - /** - * @brief returns the underlying stream used - * - * @return the cuda stream - */ - cudaStream_t get_stream() const { return stream_; } - - protected: - value_type* data_; - - private: - size_type size_; - size_type capacity_; - cudaStream_t stream_; - 
std::shared_ptr allocator_; - - /** - * @brief Sets a new cuda stream where the future operations will be queued - * - * This method makes sure that the inter-stream dependencies are met and taken - * care of, before setting the input stream as a new stream for this buffer. - * Ideally, the same cuda stream passed during constructor is expected to be - * used throughout this buffer's lifetime, for performance. - * - * @param[in] stream new cuda stream to be set. If it is the same as the - * current one, then this method will be a no-op. - */ - void set_stream(cudaStream_t stream) { - if (stream_ != stream) { - cudaEvent_t event; - CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - CUDA_CHECK(cudaEventRecord(event, stream_)); - CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); - stream_ = stream; - CUDA_CHECK(cudaEventDestroy(event)); - } - } -}; // class buffer_base - -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp deleted file mode 100644 index 3d1ce38c31..0000000000 --- a/cpp/include/raft/mr/device/allocator.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief An explicit interface for an asynchronous device allocator. - * - * This is mostly done in order to reduce work needed in cuML codebase. - * An implementation of this interface can make the following assumptions, - * further to the ones listed in `Allocator`: - * - Allocations may be always on the device that was specified on construction. - */ -class allocator : public base_allocator {}; - -/** Default device allocator based on the one provided by RMM */ -class default_allocator : public allocator { - public: - void* allocate(std::size_t n, cudaStream_t stream) override { - void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); - return ptr; - } - - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - rmm::mr::get_current_device_resource()->deallocate(p, n, stream); - } -}; // class default_allocator - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp deleted file mode 100644 index 39b5674ce4..0000000000 --- a/cpp/include/raft/mr/device/buffer.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
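The set_stream handoff above is the standard CUDA idiom for re-ordering work between two streams: record an event on the old stream and make the new stream wait on it. As a standalone sketch (CUDA_CHECK being the error macro from raft/cudart_utils.h):

    // Make all work previously enqueued on `src` a prerequisite of `dst`.
    inline void chain_streams(cudaStream_t src, cudaStream_t dst)
    {
      cudaEvent_t event;
      CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
      CUDA_CHECK(cudaEventRecord(event, src));         // mark the current tail of `src`
      CUDA_CHECK(cudaStreamWaitEvent(dst, event, 0));  // `dst` waits for that marker
      CUDA_CHECK(cudaEventDestroy(event));  // safe: destruction defers until the event fires
    }
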
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include "allocator.hpp" - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief RAII object owning a contiguous typed device buffer. The passed in - * allocator supports asynchronous allocation and deallocation so this - * can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(..., cudaStream_t stream) { - * ... - * raft::mr::device::buffer temp(stream, 0); - * ... - * temp.resize(n); - * kernelA<<>>(...,temp.data(),...); - * kernelB<<>>(...,temp.data(),...); - * temp.release(); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} -}; // class buffer - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp deleted file mode 100644 index e5b3da24eb..0000000000 --- a/cpp/include/raft/mr/host/allocator.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief An explicit interface for an asynchronous host allocations. - * - * This is mostly done in order to reduce work needed in cuML codebase. - * An implementation of this interface can make the following assumptions, - * further to the ones listed in `Allocator`: - * - Allocations don't need to be zero copy accessible form a device. 
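With this host-side allocator and its pinned `host::buffer` companion both slated for removal, callers that still need page-locked staging memory can wrap the same cudaMallocHost/cudaFreeHost pair in a small RAII type. An illustrative stand-in only, not part of this series (RMM's pinned memory resource is another option):

    #include <cstddef>
    #include <new>
    #include <cuda_runtime.h>

    template <typename T>
    class pinned_buffer {
     public:
      explicit pinned_buffer(std::size_t n) : size_(n)
      {
        if (cudaMallocHost(reinterpret_cast<void**>(&data_), n * sizeof(T)) != cudaSuccess) {
          throw std::bad_alloc();
        }
      }
      ~pinned_buffer() { cudaFreeHost(data_); }  // no-throw: error intentionally ignored
      pinned_buffer(const pinned_buffer&) = delete;
      pinned_buffer& operator=(const pinned_buffer&) = delete;

      T* data() { return data_; }
      std::size_t size() const { return size_; }

     private:
      T* data_{nullptr};
      std::size_t size_{0};
    };
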
- */ -class allocator : public base_allocator {}; - -/** Default cudaMallocHost/cudaFreeHost based host allocator */ -class default_allocator : public allocator { - public: - void* allocate(std::size_t n, cudaStream_t stream) override { - void* ptr = nullptr; - CUDA_CHECK(cudaMallocHost(&ptr, n)); - return ptr; - } - - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - //Must call _NO_THROW here since this is called frequently from object - //destructors which are "nothrow" by default - CUDA_CHECK_NO_THROW(cudaFreeHost(p)); - } -}; // class default_allocator - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp deleted file mode 100644 index 3c505bf2ed..0000000000 --- a/cpp/include/raft/mr/host/buffer.hpp +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include "allocator.hpp" - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief RAII object owning a contigous typed host buffer (aka pinned memory). - * The passed in allocator supports asynchronus allocation and - * deallocation so this can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { - * ... - * raft::mr::host::buffer temp(stream, 0); - * ... - * temp.resize(n); - * raft::copy(temp.data(), in_d, temp.size()); - * ... - * raft::copy(out_d, temp.data(), temp.size()); - * temp.release(stream); - * ... 
- * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) { - if (other.size() > 0) { - raft::copy(data_, other.data(), other.size(), other.get_stream()); - } - } - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} - - reference operator[](size_type pos) { return data_[pos]; } - - const_reference operator[](size_type pos) const { return data_[pos]; } - - private: - using buffer_base::data_; -}; - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 6472eaa80b..4716b51c98 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -25,8 +25,6 @@ namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * Performs a k-select across row partitioned index/distance * matrices formatted like the following: diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index 884d81bed1..11ebb30fd6 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -24,11 +24,6 @@ from libcpp.memory cimport shared_ptr from .cuda cimport _Stream -cdef extern from "raft/mr/device/allocator.hpp" \ - namespace "raft::mr::device" nogil: - cdef cppclass allocator: - pass - cdef extern from "raft/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: handle_t() except + From 1c0f3c3cb5d44173e13faae3e9cdc265a83c1df0 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 8 Dec 2021 15:20:07 +0100 Subject: [PATCH 2/6] Update fusedL2KNN test --- cpp/test/spatial/fused_l2_knn.cu | 89 +++++++++++++++++--------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index e48a3c6657..441e72d64b 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -99,65 +99,69 @@ testing::AssertionResult devArrMatchKnnPair(const T* expected_idx, template class FusedL2KNNTest : public ::testing::TestWithParam { + public: + FusedL2KNNTest() + : stream_(handle_.get_stream()), + params_(::testing::TestWithParam::GetParam()), + database(params_.num_db_vecs * params_.dim, stream_), + search_queries(params_.num_queries * params_.dim, stream_), + raft_indices_(params_.num_queries * params_.k, stream_), + raft_distances_(params_.num_queries * params_.k, stream_), + faiss_indices_(params_.num_queries * params_.k, stream_), + faiss_distances_(params_.num_queries * params_.k, stream_) + { + CUDA_CHECK(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(T), stream_)); + CUDA_CHECK( + cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(T), stream_)); + CUDA_CHECK(cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(T), stream_)); + CUDA_CHECK(cudaMemsetAsync( + raft_distances_.data(), 0, raft_distances_.size() * 
sizeof(int64_t), stream_)); + CUDA_CHECK( + cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_)); + CUDA_CHECK( + cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_)); + } + protected: void testBruteForce() { - cudaStream_t stream = handle_.get_stream(); - launchFaissBfknn(); detail::fusedL2Knn(dim, - raft_indices_, - raft_distances_, - database, - search_queries, + raft_indices_.data(), + raft_distances_.data(), + database.data(), + search_queries.data(), num_db_vecs, num_queries, k_, true, true, - stream, + stream_, metric); // verify. - devArrMatchKnnPair(faiss_indices_, - raft_indices_, - faiss_distances_, - raft_distances_, + devArrMatchKnnPair(faiss_indices_.data(), + raft_indices_.data(), + faiss_distances_.data(), + raft_distances_.data(), num_queries, k_, float(0.001), - stream); + stream_); } void SetUp() override { - params_ = ::testing::TestWithParam::GetParam(); num_queries = params_.num_queries; num_db_vecs = params_.num_db_vecs; dim = params_.dim; k_ = params_.k; metric = params_.metric_; - cudaStream_t stream = handle_.get_stream(); - - raft::allocate(database, num_db_vecs * dim, stream, true); - raft::allocate(search_queries, num_queries * dim, stream, true); - unsigned long long int seed = 1234ULL; raft::random::Rng r(seed); - r.uniform(database, num_db_vecs * dim, T(-1.0), T(1.0), stream); - r.uniform(search_queries, num_queries * dim, T(-1.0), T(1.0), stream); - - raft::allocate(raft_indices_, num_queries * k_, stream, true); - raft::allocate(raft_distances_, num_queries * k_, stream, true); - raft::allocate(faiss_indices_, num_queries * k_, stream, true); - raft::allocate(faiss_distances_, num_queries * k_, stream, true); - } - - void TearDown() override - { - cudaStream_t stream = handle_.get_stream(); - raft::deallocate_all(stream); + r.uniform(database.data(), num_db_vecs * dim, T(-1.0), T(1.0), stream_); + r.uniform(search_queries.data(), num_queries * dim, T(-1.0), T(1.0), stream_); } void launchFaissBfknn() @@ -169,37 +173,38 @@ class FusedL2KNNTest : public ::testing::TestWithParam { gpu_res.noTempMemory(); int device; CUDA_CHECK(cudaGetDevice(&device)); - gpu_res.setDefaultStream(device, handle_.get_stream()); + gpu_res.setDefaultStream(device, stream_); faiss::gpu::GpuDistanceParams args; args.metric = m; args.metricArg = 0; args.k = k_; args.dims = dim; - args.vectors = database; + args.vectors = database.data(); args.vectorsRowMajor = true; args.numVectors = num_db_vecs; - args.queries = search_queries; + args.queries = search_queries.data(); args.queriesRowMajor = true; args.numQueries = num_queries; - args.outDistances = faiss_distances_; - args.outIndices = faiss_indices_; + args.outDistances = faiss_distances_.data(); + args.outIndices = faiss_indices_.data(); bfKnn(&gpu_res, args); } private: raft::handle_t handle_; + cudaStream_t stream_ = 0; FusedL2KNNInputs params_; int num_queries; int num_db_vecs; int dim; - T* database; - T* search_queries; - int64_t* raft_indices_; - T* raft_distances_; - int64_t* faiss_indices_; - T* faiss_distances_; + rmm::device_uvector database; + rmm::device_uvector search_queries; + rmm::device_uvector raft_indices_; + rmm::device_uvector raft_distances_; + rmm::device_uvector faiss_indices_; + rmm::device_uvector faiss_distances_; int k_; raft::distance::DistanceType metric; }; From 3d9fc94d8b107e4f6e84bfe16eb916601bd9ddc5 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 25 Jan 2022 18:29:05 +0100 Subject: [PATCH 3/6] Completing RAFT memory 
management removal --- cpp/include/raft/comms/helper.hpp | 1 - cpp/include/raft/lap/lap_kernels.cuh | 1 - cpp/include/raft/mr/buffer_base.hpp | 211 ------------------ cpp/include/raft/mr/device/allocator.hpp | 57 ----- cpp/include/raft/mr/device/buffer.hpp | 70 ------ cpp/include/raft/mr/host/allocator.hpp | 61 ----- cpp/include/raft/mr/host/buffer.hpp | 85 ------- .../raft/sparse/distance/detail/coo_spmv.cuh | 1 - .../raft/sparse/distance/detail/utils.cuh | 3 - cpp/include/raft/sparse/distance/distance.hpp | 1 - .../sparse/hierarchy/detail/agglomerative.cuh | 1 - .../hierarchy/detail/connectivities.cuh | 1 - .../raft/sparse/hierarchy/detail/mst.cuh | 1 - cpp/include/raft/sparse/op/detail/reduce.cuh | 1 - .../selection/detail/connect_components.cuh | 1 - .../raft/sparse/selection/detail/knn.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 2 - cpp/include/raft/spatial/knn/knn.hpp | 2 - cpp/test/CMakeLists.txt | 2 - cpp/test/mr/device/buffer.cpp | 92 -------- cpp/test/mr/host/buffer.cpp | 71 ------ python/raft/common/handle.pxd | 2 - python/raft/common/handle.pyx | 7 +- 23 files changed, 5 insertions(+), 670 deletions(-) delete mode 100644 cpp/include/raft/mr/buffer_base.hpp delete mode 100644 cpp/include/raft/mr/device/allocator.hpp delete mode 100644 cpp/include/raft/mr/device/buffer.hpp delete mode 100644 cpp/include/raft/mr/host/allocator.hpp delete mode 100644 cpp/include/raft/mr/host/buffer.hpp delete mode 100644 cpp/test/mr/device/buffer.cpp delete mode 100644 cpp/test/mr/host/buffer.cpp diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 09a767bea7..d83e8f4d4f 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 328cbf3e74..b61d0bd269 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -28,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp deleted file mode 100644 index 6998c1f186..0000000000 --- a/cpp/include/raft/mr/buffer_base.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include - -namespace raft { -namespace mr { - -/** - * @brief Base for all RAII-based owning of temporary memory allocations. This - * class should ideally not be used by users directly, but instead via - * the child classes `device_buffer` and `host_buffer`. 
- * - * @tparam T data type - * @tparam AllocatorT The underly allocator object - */ -template -class buffer_base { - public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; - using const_reference = const T&; - - buffer_base() = delete; - - buffer_base(const buffer_base& other) = delete; - - buffer_base& operator=(const buffer_base& other) = delete; - - /** - * @brief Main ctor - * - * @param[in] allocator asynchronous allocator used for managing buffer life - * @param[in] stream cuda stream where this allocation operations are async - * @param[in] n size of the buffer (in number of elements) - */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) - : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) - { - if (capacity_ > 0) { - data_ = - static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); - } - } - - ~buffer_base() { release(); } - - value_type* data() { return data_; } - - const value_type* data() const { return data_; } - - size_type size() const { return size_; } - - void clear() { size_ = 0; } - - iterator begin() { return data_; } - - const_iterator begin() const { return data_; } - - iterator end() { return data_ + size_; } - - const_iterator end() const { return data_ + size_; } - - /** - * @brief Reserve new memory size for this buffer. - * - * It re-allocates a fresh buffer if the new requested capacity is more than - * the current one, copies the old buffer contents to this new buffer and - * removes the old one. - * - * @param[in] new_capacity new capacity (in number of elements) - * @{ - */ - void reserve(size_type new_capacity) - { - if (new_capacity > capacity_) { - auto* new_data = - static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } - // Only deallocate if we have allocated a pointer - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = new_data; - capacity_ = new_capacity; - } - } - - void reserve(size_type new_capacity, cudaStream_t stream) - { - set_stream(stream); - reserve(new_capacity); - } - /** @} */ - - /** - * @brief Resize the underlying buffer (uses `reserve` method internally) - * - * @param[in] new_size new buffer size - * @{ - */ - void resize(const size_type new_size) - { - reserve(new_size); - size_ = new_size; - } - - void resize(const size_type new_size, cudaStream_t stream) - { - set_stream(stream); - resize(new_size); - } - /** @} */ - - /** - * @brief Deletes the underlying buffer - * - * If this method is not explicitly called, it will be during the destructor - * @{ - */ - void release() - { - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = nullptr; - capacity_ = 0; - size_ = 0; - } - - void release(cudaStream_t stream) - { - set_stream(stream); - release(); - } - /** @} */ - - /** - * @brief returns the underlying allocator used - * - * @return the allocator pointer - */ - std::shared_ptr get_allocator() const { return allocator_; } - - /** - * @brief returns the underlying stream used - * - * @return the cuda stream - */ - cudaStream_t get_stream() const { return stream_; } - - protected: - value_type* data_; - - private: - size_type size_; - size_type capacity_; - cudaStream_t stream_; 
- std::shared_ptr allocator_; - - /** - * @brief Sets a new cuda stream where the future operations will be queued - * - * This method makes sure that the inter-stream dependencies are met and taken - * care of, before setting the input stream as a new stream for this buffer. - * Ideally, the same cuda stream passed during constructor is expected to be - * used throughout this buffer's lifetime, for performance. - * - * @param[in] stream new cuda stream to be set. If it is the same as the - * current one, then this method will be a no-op. - */ - void set_stream(cudaStream_t stream) - { - if (stream_ != stream) { - cudaEvent_t event; - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - RAFT_CUDA_TRY(cudaEventRecord(event, stream_)); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); - stream_ = stream; - RAFT_CUDA_TRY(cudaEventDestroy(event)); - } - } -}; // class buffer_base - -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp deleted file mode 100644 index 8d306a199f..0000000000 --- a/cpp/include/raft/mr/device/allocator.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief An explicit interface for an asynchronous device allocator. - * - * This is mostly done in order to reduce work needed in cuML codebase. - * An implementation of this interface can make the following assumptions, - * further to the ones listed in `Allocator`: - * - Allocations may be always on the device that was specified on construction. - */ -class allocator : public base_allocator { -}; - -/** Default device allocator based on the one provided by RMM */ -class default_allocator : public allocator { - public: - void* allocate(std::size_t n, cudaStream_t stream) override - { - void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); - return ptr; - } - - void deallocate(void* p, std::size_t n, cudaStream_t stream) override - { - rmm::mr::get_current_device_resource()->deallocate(p, n, stream); - } -}; // class default_allocator - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp deleted file mode 100644 index 9b5ff11c50..0000000000 --- a/cpp/include/raft/mr/device/buffer.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief RAII object owning a contiguous typed device buffer. The passed in - * allocator supports asynchronous allocation and deallocation so this - * can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(..., cudaStream_t stream) { - * ... - * raft::mr::device::buffer temp(stream, 0); - * ... - * temp.resize(n); - * kernelA<<>>(...,temp.data(),...); - * kernelB<<>>(...,temp.data(),...); - * temp.release(); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } -}; // class buffer - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp deleted file mode 100644 index 71b5465451..0000000000 --- a/cpp/include/raft/mr/host/allocator.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief An explicit interface for an asynchronous host allocations. - * - * This is mostly done in order to reduce work needed in cuML codebase. - * An implementation of this interface can make the following assumptions, - * further to the ones listed in `Allocator`: - * - Allocations don't need to be zero copy accessible form a device. 
- */ -class allocator : public base_allocator { -}; - -/** Default cudaMallocHost/cudaFreeHost based host allocator */ -class default_allocator : public allocator { - public: - void* allocate(std::size_t n, cudaStream_t stream) override - { - void* ptr = nullptr; - RAFT_CUDA_TRY(cudaMallocHost(&ptr, n)); - return ptr; - } - - void deallocate(void* p, std::size_t n, cudaStream_t stream) override - { - // Must call _NO_THROW here since this is called frequently from object - // destructors which are "nothrow" by default - RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(p)); - } -}; // class default_allocator - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp deleted file mode 100644 index 204b384719..0000000000 --- a/cpp/include/raft/mr/host/buffer.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include -#include - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief RAII object owning a contigous typed host buffer (aka pinned memory). - * The passed in allocator supports asynchronus allocation and - * deallocation so this can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { - * ... - * raft::mr::host::buffer temp(stream, 0); - * ... - * temp.resize(n); - * raft::copy(temp.data(), in_d, temp.size()); - * ... - * raft::copy(out_d, temp.data(), temp.size()); - * temp.release(stream); - * ... 
- * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) - { - if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } - } - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } - - reference operator[](size_type pos) { return data_[pos]; } - - const_reference operator[](size_type pos) const { return data_[pos]; } - - private: - using buffer_base::data_; -}; - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index c23a2b1537..cb9e67dd6b 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include "../../csr.hpp" diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 8c01b33c1e..6af7867f63 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -16,9 +16,6 @@ #pragma once -#include -#include - #include namespace raft { diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 2f121dce33..aee53ac37a 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 4e78494e6b..593702b4a7 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 5d4640f4a6..548ca00c93 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 7173c76c08..10e9d04c0d 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index 074a139ba9..715bbe6deb 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git 
a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 817b9782f2..a7364dec4c 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index de0a15c029..b8dc25053b 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index e8cc85256d..a63af32603 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index ecee562795..59df75ba36 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -19,8 +19,6 @@ #include "detail/knn_brute_force_faiss.cuh" #include "detail/selection_faiss.cuh" -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index b37c671525..dac555c1d3 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -63,8 +63,6 @@ add_executable(test_raft test/linalg/unary_op.cu test/matrix/math.cu test/matrix/matrix.cu - test/mr/device/buffer.cpp - test/mr/host/buffer.cpp test/mst.cu test/random/rng.cu test/random/rng_int.cu diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp deleted file mode 100644 index 4861a4ca1f..0000000000 --- a/cpp/test/mr/device/buffer.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace mr { -namespace device { - -TEST(Raft, DeviceBufferAlloc) -{ - cudaStream_t stream; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - // no allocation at construction - rmm::device_uvector buff(0, stream); - ASSERT_EQ(0, buff.size()); - // explicit allocation after construction - buff.resize(20, stream); - ASSERT_EQ(20, buff.size()); - // resizing to a smaller buffer size - buff.resize(10, stream); - ASSERT_EQ(10, buff.size()); - // explicit deallocation - buff.release(); - ASSERT_EQ(0, buff.size()); - // use these methods without the explicit stream parameter - buff.resize(20, stream); - ASSERT_EQ(20, buff.size()); - buff.resize(10, stream); - ASSERT_EQ(10, buff.size()); - buff.release(); - ASSERT_EQ(0, buff.size()); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); -} - -TEST(Raft, DeviceBufferZeroResize) -{ - // Create a limiting_resource_adaptor to track allocations - auto curr_mr = - dynamic_cast(rmm::mr::get_current_device_resource()); - auto limit_mr = - std::make_shared>(curr_mr, - 1000); - - rmm::mr::set_current_device_resource(limit_mr.get()); - - cudaStream_t stream; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - // no allocation at construction - rmm::device_uvector buff(10, stream); - ASSERT_EQ(10, buff.size()); - // explicit allocation after construction - buff.resize(0, stream); - ASSERT_EQ(0, buff.size()); - // resizing to a smaller buffer size - buff.resize(20, stream); - ASSERT_EQ(20, buff.size()); - // explicit deallocation - buff.release(); - ASSERT_EQ(0, buff.size()); - - // Now check that there is no memory left. (Used to not be true) - ASSERT_EQ(0, limit_mr->get_allocated_bytes()); - - rmm::mr::set_current_device_resource(curr_mr); - - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); -} - -} // namespace device -} // namespace mr -} // namespace raft diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp deleted file mode 100644 index d645ffa0e0..0000000000 --- a/cpp/test/mr/host/buffer.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
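The DeviceBufferZeroResize test above exercises an RMM adaptor to prove that release() returns every byte; condensed, the tracking pattern it relies on looks like this (a sketch assuming only the RMM calls already visible in that test):

    #include <cassert>
    #include <rmm/mr/device/device_memory_resource.hpp>
    #include <rmm/mr/device/limiting_resource_adaptor.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    void check_no_leaks()
    {
      auto* upstream = rmm::mr::get_current_device_resource();
      // Route allocations through an adaptor that caps and counts them.
      rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> limited(upstream, 1000);
      rmm::mr::set_current_device_resource(&limited);

      // ... perform stream-ordered allocations and free them ...

      assert(limited.get_allocated_bytes() == 0);      // everything was returned
      rmm::mr::set_current_device_resource(upstream);  // restore the original resource
    }
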
- */ - -#include -#include -#include -#include -#include - -namespace raft { -namespace mr { -namespace host { - -TEST(Raft, HostBuffer) -{ - auto alloc = std::make_shared(); - cudaStream_t stream; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - // no allocation at construction - buffer buff(alloc, stream); - ASSERT_EQ(0, buff.size()); - // explicit allocation after construction - buff.resize(20, stream); - ASSERT_EQ(20, buff.size()); - // resizing to a smaller buffer size - buff.resize(10, stream); - ASSERT_EQ(10, buff.size()); - // explicit deallocation - buff.release(stream); - ASSERT_EQ(0, buff.size()); - // use these methods without the explicit stream parameter - buff.resize(20); - ASSERT_EQ(20, buff.size()); - buff.resize(10); - ASSERT_EQ(10, buff.size()); - buff.release(); - ASSERT_EQ(0, buff.size()); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); -} - -TEST(Raft, DeviceToHostBuffer) -{ - auto d_alloc = std::make_shared(); - auto h_alloc = std::make_shared(); - cudaStream_t stream; - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - device::buffer d_buff(d_alloc, stream, 32); - RAFT_CUDA_TRY(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); - buffer h_buff(h_alloc, d_buff); - ASSERT_EQ(d_buff.size(), h_buff.size()); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - RAFT_CUDA_TRY(cudaStreamDestroy(stream)); -} - -} // namespace host -} // namespace mr -} // namespace raft diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index 077b6d62b6..dc38ac47ec 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -33,8 +33,6 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t(cuda_stream_view stream_view) except + handle_t(cuda_stream_view stream_view, shared_ptr[cuda_stream_pool] stream_pool) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + cuda_stream_view get_stream() except + void sync_stream() except + diff --git a/python/raft/common/handle.pyx b/python/raft/common/handle.pyx index 1accf9e679..b4a681931a 100644 --- a/python/raft/common/handle.pyx +++ b/python/raft/common/handle.pyx @@ -20,13 +20,16 @@ # cython: language_level = 3 # import raft -from libcpp.memory cimport shared_ptr from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread from rmm._lib.cuda_stream_view cimport cuda_stream_view from .cuda cimport _Stream, _Error, cudaStreamSynchronize from .cuda import CudaRuntimeError +cdef extern from * nogil: + ctypedef void* cudaStream_t "cudaStream_t" + + cdef class Handle: """ Handle is a lightweight python wrapper around the corresponding C++ class @@ -64,7 +67,7 @@ cdef class Handle: self.stream_pool)) else: # this constructor constructs a handle on user stream - c_stream = cuda_stream_view(<_Stream> stream.getStream()) + c_stream = cuda_stream_view(stream.getStream()) self.c_obj.reset(new handle_t(c_stream, self.stream_pool)) From 41d1326b3c65da6e8d42eeb9803a56c85476e0f7 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 14 Feb 2022 16:49:25 +0100 Subject: [PATCH 4/6] Use of RAFT_CUDA_TRY --- cpp/include/raft/random/detail/rng_impl.cuh | 2 +- cpp/test/linalg/rsvd.cu | 10 +++++----- cpp/test/span.cu | 8 ++++---- cpp/test/spatial/fused_l2_knn.cu | 15 ++++++++------- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index 2406456404..b77e84284a 100644 --- 
a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -846,7 +846,7 @@ class RngImpl { rmm::device_uvector workspace(0, stream); sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync( + RAFT_CUDA_TRY(cudaMemcpyAsync( outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } scatter(out, in, outIdxPtr, sampledLen, stream); diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu index da38464bf7..8aef3e77a4 100644 --- a/cpp/test/linalg/rsvd.cu +++ b/cpp/test/linalg/rsvd.cu @@ -100,11 +100,11 @@ class RsvdTest : public ::testing::TestWithParam> { int len_redundant = m * n_redundant; r.normal(A.data(), len_informative, mu, sigma, stream); - CUDA_CHECK(cudaMemcpyAsync(A.data() + len_informative, - A.data(), - len_redundant * sizeof(T), - cudaMemcpyDeviceToDevice, - stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(A.data() + len_informative, + A.data(), + len_redundant * sizeof(T), + cudaMemcpyDeviceToDevice, + stream)); } std::vector A_backup_cpu(m * n); // Backup A matrix as svdJacobi will destroy the content of A diff --git a/cpp/test/span.cu b/cpp/test/span.cu index e121cea108..abecfc677c 100644 --- a/cpp/test/span.cu +++ b/cpp/test/span.cu @@ -30,16 +30,16 @@ struct TestStatus { public: TestStatus() { - CUDA_CHECK(cudaMalloc(&status_, sizeof(int))); + RAFT_CUDA_TRY(cudaMalloc(&status_, sizeof(int))); int h_status = 1; - CUDA_CHECK(cudaMemcpy(status_, &h_status, sizeof(int), cudaMemcpyHostToDevice)); + RAFT_CUDA_TRY(cudaMemcpy(status_, &h_status, sizeof(int), cudaMemcpyHostToDevice)); } - ~TestStatus() noexcept(false) { CUDA_CHECK(cudaFree(status_)); } + ~TestStatus() noexcept(false) { RAFT_CUDA_TRY(cudaFree(status_)); } int Get() { int h_status; - CUDA_CHECK(cudaMemcpy(&h_status, status_, sizeof(int), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(&h_status, status_, sizeof(int), cudaMemcpyDeviceToHost)); return h_status; } diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 57d7126fba..db84c9d47b 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -110,15 +110,16 @@ class FusedL2KNNTest : public ::testing::TestWithParam { faiss_indices_(params_.num_queries * params_.k, stream_), faiss_distances_(params_.num_queries * params_.k, stream_) { - CUDA_CHECK(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(T), stream_)); - CUDA_CHECK( + RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(T), stream_)); + RAFT_CUDA_TRY( cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(T), stream_)); - CUDA_CHECK(cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(T), stream_)); - CUDA_CHECK(cudaMemsetAsync( + RAFT_CUDA_TRY( + cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(T), stream_)); + RAFT_CUDA_TRY(cudaMemsetAsync( raft_distances_.data(), 0, raft_distances_.size() * sizeof(int64_t), stream_)); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_)); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_)); } @@ -172,7 +173,7 @@ class FusedL2KNNTest : public ::testing::TestWithParam { gpu_res.noTempMemory(); int device; - CUDA_CHECK(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaGetDevice(&device)); gpu_res.setDefaultStream(device, stream_); 
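    // Note on the macro swap in this patch: RAFT_CUDA_TRY is the newer spelling of
    // CUDA_CHECK from raft/cudart_utils.h; it runs a CUDA runtime call and throws a
    // RAFT exception when the call does not return cudaSuccess. The general shape
    // of such a macro, as an illustrative sketch only (not RAFT's exact definition):
    //
    //   #define CUDA_TRY(call)                                       \
    //     do {                                                       \
    //       cudaError_t const status = (call);                       \
    //       if (status != cudaSuccess) {                             \
    //         throw std::runtime_error(cudaGetErrorString(status));  \
    //       }                                                        \
    //     } while (0)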
faiss::gpu::GpuDistanceParams args; From 8ec45817ba14fa1eb1240b5c41ce8f37f3d8f9f8 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 24 Feb 2022 16:48:13 +0100 Subject: [PATCH 5/6] Update copyright header --- cpp/include/raft/cudart_utils.h | 2 +- cpp/include/raft/sparse/distance/detail/utils.cuh | 2 +- cpp/test/CMakeLists.txt | 2 +- cpp/test/spatial/fused_l2_knn.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 3a063be77e..1940fcea51 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 6af7867f63..ed2b414c70 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 8b7c39669b..8b52ea6158 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index db84c9d47b..1b6c8390ed 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 0243630b13a375b06298aef620f01a72d1812b34 Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Tue, 15 Mar 2022 19:35:48 +0100
Subject: [PATCH 6/6] Resolve remaining issues

---
 .../raft/random/detail/make_regression.cuh  |  1 -
 .../raft/stats/detail/homogeneity_score.cuh |  1 -
 cpp/test/spatial/fused_l2_knn.cu            |  6 +--
 cpp/test/stats/information_criterion.cu     | 53 ++++++++++---------
 4 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh
index 8bab85e485..42c1319889 100644
--- a/cpp/include/raft/random/detail/make_regression.cuh
+++ b/cpp/include/raft/random/detail/make_regression.cuh
@@ -30,7 +30,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/stats/detail/homogeneity_score.cuh b/cpp/include/raft/stats/detail/homogeneity_score.cuh
index 4c78553258..e781b58875 100644
--- a/cpp/include/raft/stats/detail/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/detail/homogeneity_score.cuh
@@ -22,7 +22,6 @@

 #pragma once

-#include
 #include
 #include
diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu
index 3be4b47256..2ec4e86d1f 100644
--- a/cpp/test/spatial/fused_l2_knn.cu
+++ b/cpp/test/spatial/fused_l2_knn.cu
@@ -114,9 +114,9 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     RAFT_CUDA_TRY(
       cudaMemsetAsync(search_queries.data(), 0, search_queries.size() * sizeof(T), stream_));
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(T), stream_));
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      raft_distances_.data(), 0, raft_distances_.size() * sizeof(int64_t), stream_));
+      cudaMemsetAsync(raft_indices_.data(), 0, raft_indices_.size() * sizeof(int64_t), stream_));
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(raft_distances_.data(), 0, raft_distances_.size() * sizeof(T), stream_));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
     RAFT_CUDA_TRY(
diff --git a/cpp/test/stats/information_criterion.cu b/cpp/test/stats/information_criterion.cu
index 802e3fee23..d61f8591a5 100644
--- a/cpp/test/stats/information_criterion.cu
+++ b/cpp/test/stats/information_criterion.cu
@@ -19,7 +19,8 @@
 #include
 #include
-#include <raft/mr/device/allocator.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
 #include

@@ -59,21 +60,23 @@ struct BatchedICInputs {

 template <typename T>
 class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
+ public:
+  BatchedICTest()
+    : params(::testing::TestWithParam<BatchedICInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      res_d(params.batch_size, stream)
+  {
+  }
+
 protected:
   void SetUp() override
   {
     using std::vector;
-    params = ::testing::TestWithParam<BatchedICInputs<T>>::GetParam();
-
-    // Create stream and allocator
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    allocator = std::make_shared<raft::mr::device::default_allocator>();

     // Create arrays
     std::vector<T> loglike_h = std::vector<T>(params.batch_size);
     res_h.resize(params.batch_size);
-    T* loglike_d = (T*)allocator->allocate(sizeof(T) * params.batch_size, stream);
-    res_d        = (T*)allocator->allocate(sizeof(T) * params.batch_size, stream);
+    rmm::device_uvector<T> loglike_d(params.batch_size, stream);

     // Generate random data
     std::random_device rd;
@@ -83,11 +86,11 @@ class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
       loglike_h[i] = std::log(udis(gen));

     // Copy the data to the device
-    raft::update_device(loglike_d, loglike_h.data(), params.batch_size, stream);
+    raft::update_device(loglike_d.data(), loglike_h.data(), params.batch_size, stream);

     // Compute the tested results
-    information_criterion_batched(res_d,
-                                  loglike_d,
+    information_criterion_batched(res_d.data(),
+                                  loglike_d.data(),
                                   params.ic_type,
                                   params.n_params,
                                   params.batch_size,
@@ -103,22 +106,14 @@ class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
                                   params.n_samples);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-
-    allocator->deallocate(loglike_d, sizeof(T) * params.batch_size, stream);
-  }
-
-  void TearDown() override
-  {
-    allocator->deallocate(res_d, sizeof(T) * params.batch_size, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
   }

 protected:
-  std::shared_ptr<raft::mr::device::allocator> allocator;
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
   BatchedICInputs<T> params;
-  T* res_d;
+  rmm::device_uvector<T> res_d;
   std::vector<T> res_h;
-  cudaStream_t stream = 0;
 };

 // Test parameters (op, n_batches, m, n, p, q, tolerance)
@@ -133,13 +128,19 @@ using BatchedICTestD = BatchedICTest<double>;
 using BatchedICTestF = BatchedICTest<float>;
 TEST_P(BatchedICTestD, Result)
 {
-  ASSERT_TRUE(devArrMatchHost(
-    res_h.data(), res_d, params.batch_size, raft::CompareApprox<double>(params.tolerance), stream));
+  ASSERT_TRUE(devArrMatchHost(res_h.data(),
+                              res_d.data(),
+                              params.batch_size,
+                              raft::CompareApprox<double>(params.tolerance),
+                              stream));
 }
 TEST_P(BatchedICTestF, Result)
 {
-  ASSERT_TRUE(devArrMatchHost(
-    res_h.data(), res_d, params.batch_size, raft::CompareApprox<float>(params.tolerance), stream));
+  ASSERT_TRUE(devArrMatchHost(res_h.data(),
+                              res_d.data(),
+                              params.batch_size,
+                              raft::CompareApprox<float>(params.tolerance),
+                              stream));
 }

 INSTANTIATE_TEST_CASE_P(BatchedICTests, BatchedICTestD, ::testing::ValuesIn(inputsd));
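The information_criterion.cu rewrite above is the template for the whole series: delete the hand-rolled stream/allocator setup and the matching TearDown, let a raft::handle_t provide the stream, and let an RAII container own the memory. A minimal before/after sketch follows, assuming only the raft::handle_t and rmm::device_uvector APIs used in the patch; the function and buffer names are placeholders, and note that device_uvector is sized in elements, not bytes.

    // Illustrative sketch of the allocator-to-RAII migration; not code from the patch.
    #include <cstddef>
    #include <raft/cudart_utils.h>  // assumed source of RAFT_CUDA_TRY
    #include <raft/handle.hpp>
    #include <rmm/device_uvector.hpp>

    void scratch_buffer_example(std::size_t n_elems)
    {
      raft::handle_t handle;
      cudaStream_t stream = handle.get_stream();

      // Before: ptr = (T*)allocator->allocate(sizeof(T) * n_elems, stream);
      //         ... allocator->deallocate(ptr, sizeof(T) * n_elems, stream);
      // After: the buffer is stream-ordered, sized in elements, and frees
      // itself on scope exit, so no TearDown-style cleanup is needed.
      rmm::device_uvector<float> buf(n_elems, stream);
      RAFT_CUDA_TRY(cudaMemsetAsync(buf.data(), 0, buf.size() * sizeof(float), stream));
      RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
    }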