From 1c4e1e600b8b5da4265290f031e7785e824b706e Mon Sep 17 00:00:00 2001
From: Victor Lafargue
Date: Fri, 27 Aug 2021 17:26:55 +0200
Subject: [PATCH] Miscellaneous tech debt cleanups (#286)

Miscellaneous updates to pay down tech debt in RAFT:
- [x] Removal of handle host and device allocators
- [x] Addition of a `get_thrust_policy` method to the handle
- [x] Usage of `get_thrust_policy` where a handle is available
- [x] Removal of `rmm::device_vector`
- [x] Use of the RMM device allocator in the `raft::allocate` function
- [x] Creation of an allocation + deallocation helper system
- [x] Usage of `rmm::exec_policy` instead of `thrust::cuda::par.on` when no handle is available

A short usage sketch tying these changes together follows at the end of this patch.

Authors:
  - Victor Lafargue (https://github.com/viclafargue)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/286
---
 cpp/cmake/thirdparty/get_rmm.cmake | 2 +-
 cpp/include/raft/common/cub_wrappers.cuh | 4 +-
 cpp/include/raft/comms/helper.hpp | 10 +-
 cpp/include/raft/comms/std_comms.hpp | 40 ++----
 cpp/include/raft/comms/test.hpp | 41 ++----
 cpp/include/raft/cudart_utils.h | 46 +++++--
 cpp/include/raft/distance/distance.cuh | 7 +-
 cpp/include/raft/handle.hpp | 30 +---
 cpp/include/raft/label/classlabels.cuh | 57 ++++----
 cpp/include/raft/lap/lap.cuh | 87 ++++++------
 cpp/include/raft/lap/lap_functions.cuh | 47 +++----
 .../raft/linalg/cholesky_r1_update.cuh | 8 +-
 cpp/include/raft/linalg/eig.cuh | 28 ++--
 cpp/include/raft/linalg/init.h | 3 +-
 cpp/include/raft/linalg/qr.cuh | 19 ++-
 cpp/include/raft/linalg/svd.cuh | 28 ++--
 cpp/include/raft/linalg/transpose.h | 4 +-
 cpp/include/raft/matrix/math.cuh | 10 +-
 cpp/include/raft/matrix/matrix.cuh | 14 +-
 cpp/include/raft/random/rng.cuh | 17 +--
 cpp/include/raft/sparse/convert/csr.cuh | 24 ++--
 cpp/include/raft/sparse/coo.cuh | 38 +++--
 cpp/include/raft/sparse/csr.cuh | 19 +--
 .../raft/sparse/distance/bin_distance.cuh | 11 +-
 cpp/include/raft/sparse/distance/coo_spmv.cuh | 3 -
 .../coo_spmv_strategies/base_strategy.cuh | 2 -
 .../coo_mask_row_iterators.cuh | 1 -
 .../coo_spmv_strategies/hash_strategy.cuh | 2 +-
 cpp/include/raft/sparse/distance/distance.cuh | 1 -
 .../raft/sparse/distance/ip_distance.cuh | 3 +-
 .../raft/sparse/distance/l2_distance.cuh | 21 +--
 cpp/include/raft/sparse/distance/utils.cuh | 2 -
 .../sparse/hierarchy/detail/agglomerative.cuh | 13 +-
 .../hierarchy/detail/connectivities.cuh | 13 +-
 .../raft/sparse/hierarchy/detail/mst.cuh | 16 +--
 .../raft/sparse/hierarchy/single_linkage.hpp | 1 -
 cpp/include/raft/sparse/linalg/add.cuh | 10 +-
 cpp/include/raft/sparse/linalg/spectral.cuh | 23 ++--
 cpp/include/raft/sparse/linalg/symmetrize.cuh | 33 ++---
 cpp/include/raft/sparse/linalg/transpose.h | 12 +-
 .../raft/sparse/mst/detail/mst_solver_inl.cuh | 122 ++++++++--------
 cpp/include/raft/sparse/mst/detail/utils.cuh | 4 +-
 cpp/include/raft/sparse/mst/mst_solver.cuh | 24 ++--
 cpp/include/raft/sparse/op/filter.cuh | 36 ++---
 cpp/include/raft/sparse/op/reduce.cuh | 10 +-
 cpp/include/raft/sparse/op/sort.h | 21 +--
 .../sparse/selection/connect_components.cuh | 36 ++---
 cpp/include/raft/sparse/selection/knn.cuh | 2 -
 .../raft/sparse/selection/knn_graph.cuh | 1 -
 cpp/include/raft/spatial/knn/ann.hpp | 3 -
 .../knn/detail/ann_quantized_faiss.cuh | 8 +-
 .../knn/detail/knn_brute_force_faiss.cuh | 19 ++-
 .../raft/spatial/knn/detail/processing.hpp | 33 ++---
 cpp/include/raft/spatial/knn/knn.hpp | 6 +-
 cpp/include/raft/spectral/cluster_solvers.hpp | 9 +-
 cpp/include/raft/spectral/kmeans.hpp | 130 ++++++++----------
cpp/include/raft/spectral/matrix_wrappers.hpp | 83 ++++------- .../raft/spectral/modularity_maximization.hpp | 29 ++-- cpp/include/raft/spectral/partition.hpp | 29 ++-- cpp/include/raft/spectral/spectral_util.hpp | 23 ++-- cpp/test/cluster_solvers.cu | 9 +- cpp/test/distance/dist_adj.cu | 22 ++- cpp/test/distance/distance_base.cuh | 24 ++-- cpp/test/distance/fused_l2_nn.cu | 32 ++--- cpp/test/eigen_solvers.cu | 9 +- cpp/test/label/label.cu | 34 ++--- cpp/test/label/merge_labels.cu | 6 +- cpp/test/lap/lap.cu | 17 ++- cpp/test/linalg/add.cu | 14 +- cpp/test/linalg/binary_op.cu | 16 +-- cpp/test/linalg/cholesky_r1.cu | 27 ++-- cpp/test/linalg/coalesced_reduction.cu | 15 +- cpp/test/linalg/divide.cu | 15 +- cpp/test/linalg/eig.cu | 34 ++--- cpp/test/linalg/eig_sel.cu | 19 +-- cpp/test/linalg/eltwise.cu | 14 +- cpp/test/linalg/map.cu | 25 ++-- cpp/test/linalg/map_then_reduce.cu | 26 ++-- cpp/test/linalg/matrix_vector_op.cu | 21 ++- cpp/test/linalg/multiply.cu | 15 +- cpp/test/linalg/norm.cu | 18 ++- cpp/test/linalg/reduce.cu | 10 +- cpp/test/linalg/reduce.cuh | 15 +- cpp/test/linalg/strided_reduction.cu | 10 +- cpp/test/linalg/subtract.cu | 18 ++- cpp/test/linalg/svd.cu | 28 ++-- cpp/test/linalg/transpose.cu | 12 +- cpp/test/linalg/unary_op.cu | 11 +- cpp/test/matrix/math.cu | 51 +++---- cpp/test/matrix/matrix.cu | 34 +++-- cpp/test/mr/device/buffer.cpp | 18 ++- cpp/test/mst.cu | 24 +++- cpp/test/random/rng.cu | 41 +++--- cpp/test/random/rng_int.cu | 12 +- cpp/test/random/sample_without_replacement.cu | 14 +- cpp/test/sparse/add.cu | 43 ++---- cpp/test/sparse/connect_components.cu | 11 +- cpp/test/sparse/convert_coo.cu | 10 +- cpp/test/sparse/convert_csr.cu | 26 ++-- cpp/test/sparse/csr_row_slice.cu | 34 ++--- cpp/test/sparse/csr_to_dense.cu | 22 +-- cpp/test/sparse/csr_transpose.cu | 36 ++--- cpp/test/sparse/degree.cu | 36 ++--- cpp/test/sparse/dist_coo_spmv.cu | 29 ++-- cpp/test/sparse/distance.cu | 21 +-- cpp/test/sparse/filter.cu | 13 +- cpp/test/sparse/knn.cu | 27 ++-- cpp/test/sparse/knn_graph.cu | 16 +-- cpp/test/sparse/linkage.cu | 39 ++---- cpp/test/sparse/norm.cu | 15 +- cpp/test/sparse/reduce.cu | 3 +- cpp/test/sparse/row_op.cu | 12 +- cpp/test/sparse/selection.cu | 24 +--- cpp/test/sparse/sort.cu | 13 +- cpp/test/sparse/symmetrize.cu | 33 ++--- cpp/test/spatial/haversine.cu | 33 ++--- cpp/test/spatial/knn.cu | 34 +++-- cpp/test/spectral_matrix.cu | 19 ++- cpp/test/stats/mean.cu | 10 +- cpp/test/stats/mean_center.cu | 18 ++- cpp/test/stats/stddev.cu | 18 ++- cpp/test/stats/sum.cu | 12 +- python/raft/common/handle.pxd | 2 - 123 files changed, 1114 insertions(+), 1583 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 85ebc6238e..e990ab1367 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -44,4 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8d5b29f700..8e3519fea5 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft { @@ -34,7 +34,7 @@ namespace raft { * @param stream cuda stream */ template -void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, +void 
sortPairs(rmm::device_uvector &workspace, const KeyT *inKeys, KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, cudaStream_t stream) { size_t worksize; diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..e01490d728 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -38,11 +38,10 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream))); + new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -80,12 +79,11 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, } } - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..ff75931fb9 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -64,17 +64,16 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0), - device_allocator_(device_allocator) { + next_request_id_(0) { initialize(); }; @@ -85,27 +84,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false), - device_allocator_(device_allocator) { + subcomms_ucp_(false) { initialize(); }; - virtual ~std_comms() { - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); - device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); - } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = status_.data(); + recvbuff_ = status_.data() + 1; } int get_size() const { return num_ranks_; } @@ -113,8 +104,8 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } std::unique_ptr comm_split(int color, int key) const { - mr::device::buffer d_colors(device_allocator_, stream_, get_size()); - mr::device::buffer d_keys(device_allocator_, stream_, get_size()); + rmm::device_uvector d_colors(get_size(), stream_); + rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); 
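+    // Compared to the removed raft::mr::device::buffer, rmm::device_uvector
+    // exposes the same data()/size() surface but takes its size and stream at
+    // construction; the memory now comes from the current RMM device resource.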
 update_device(d_keys.data() + get_rank(), &key, 1, stream_);
@@ -167,12 +158,12 @@ class std_comms : public comms_iface {
     if (ucp_worker_ != nullptr && subcomms_ucp_) {
       auto eps_sp = std::make_shared(new_ucx_ptrs.data());
-      return std::unique_ptr(new std_comms(
-        nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key,
-        device_allocator_, stream_, subcomms_ucp_));
+      return std::unique_ptr(
+        new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp,
+                      subcomm_ranks.size(), key, stream_, subcomms_ucp_));
     } else {
-      return std::unique_ptr(new std_comms(
-        nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_));
+      return std::unique_ptr(
+        new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_));
     }
   }
@@ -465,6 +456,7 @@ class std_comms : public comms_iface {
   cudaStream_t stream_;
   int *sendbuff_, *recvbuff_;
+  rmm::device_uvector status_;
   int num_ranks_;
   int rank_;
@@ -478,8 +470,6 @@ class std_comms : public comms_iface {
   mutable std::unordered_map requests_in_flight_;
   mutable std::unordered_set free_requests_;
-
-  std::shared_ptr device_allocator_;
 };
 }  // end namespace comms
 }  // end namespace raft
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 4e95c4eef0..17db8e88af 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -18,7 +18,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -44,8 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
   cudaStream_t stream = handle.get_stream();
-  raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_scalar temp_d(stream);
-  CUDA_CHECK(
-    cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                             cudaMemcpyHostToDevice, stream));
@@ -76,8 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
   cudaStream_t stream = handle.get_stream();
-  raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_scalar temp_d(stream);
   if (communicator.get_rank() == root)
     CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                                cudaMemcpyHostToDevice, stream));
@@ -104,8 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) {
   cudaStream_t stream = handle.get_stream();
-  raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_scalar temp_d(stream);
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -134,11 +130,8 @@ bool test_collective_allgather(const handle_t &handle, int root) {
   cudaStream_t stream = handle.get_stream();
-  raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream,
-                                  communicator.get_size());
+  rmm::device_scalar temp_d(stream);
+  rmm::device_uvector recv_d(communicator.get_size(), stream);
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -169,12 +162,9 @@ bool test_collective_gather(const handle_t &handle, int root) {
   cudaStream_t stream = handle.get_stream();
-  raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? communicator.get_size() : 0);
+  rmm::device_scalar temp_d(stream);
+  rmm::device_uvector recv_d(
+    communicator.get_rank() == root ? 
communicator.get_size() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -211,12 +201,9 @@ bool test_collective_gatherv(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(sends.size(), stream); - - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? displacements.back() : 0); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_uvector recv_d( + communicator.get_rank() == root ? displacements.back() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, @@ -256,10 +243,8 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - 1); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_scalar recv_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..85ca310530 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include @@ -25,6 +27,8 @@ #include #include #include +#include +#include ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" @@ -200,7 +204,8 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { +void copy(Type* dst, const Type* src, size_t len, + rmm::cuda_stream_view stream) { CUDA_CHECK( cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } @@ -214,20 +219,20 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { /** performs a host to device copy */ template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(h_ptr, d_ptr, len, stream); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } @@ -259,11 +264,36 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -/** cuda malloc */ +static std::mutex mutex_; +static std::unordered_map allocations; + +template +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, + bool setZero = false) { + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; +} + template -void allocate(Type*& ptr, size_t len, bool setZero = false) { - CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); - if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { 
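+  // `allocations` maps each raw pointer to the byte count stored by
+  // raft::allocate above, so deallocate can hand the exact size back to the
+  // current RMM device resource without callers tracking lengths.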
+ std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); +} + +inline void deallocate_all(rmm::cuda_stream_view stream) { + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); } /** helper method to get max usable shared mem per block parameter */ diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 02d8fb6d03..65b4f3b830 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -31,7 +31,7 @@ #include #include #include -#include +#include namespace raft { namespace distance { @@ -376,7 +376,7 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, template void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + rmm::device_uvector &workspace, cudaStream_t stream, bool isRowMajor, Type metric_arg = 2.0f) { auto worksize = @@ -389,8 +389,7 @@ void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, template void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + Index_ n, Index_ k, rmm::device_uvector &workspace, raft::distance::DistanceType metric, cudaStream_t stream, bool isRowMajor = true, Type metric_arg = 2.0f) { switch (metric) { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..c925669530 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include "cudart_utils.h" namespace raft { @@ -63,10 +62,9 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - streams_(n_streams), - device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + streams_(n_streams) { create_resources(); + thrust_policy_ = std::make_unique(user_stream_); } /** @@ -86,10 +84,9 @@ class handle_t { "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); + thrust_policy_ = std::make_unique(user_stream_); } /** Destroys all held-up resources */ @@ -103,20 +100,6 @@ class handle_t { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { - device_allocator_ = allocator; - } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } - - void set_host_allocator(std::shared_ptr allocator) { - host_allocator_ = allocator; - } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } - cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { @@ -153,6 +136,8 @@ class handle_t { return cusparse_handle_; } + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); @@ -236,8 +221,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable 
cusparseHandle_t cusparse_handle_;
   mutable bool cusparse_initialized_{false};
-  std::shared_ptr device_allocator_;
-  std::shared_ptr host_allocator_;
+  std::unique_ptr thrust_policy_{nullptr};
   cudaStream_t user_stream_{nullptr};
   cudaEvent_t event_;
   mutable cudaDeviceProp prop_;
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 0da7da2eb6..b2302836bc 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -21,8 +21,8 @@
 #include
 #include
 #include
-#include
-#include
+#include
+#include
 namespace raft {
 namespace label {
@@ -36,41 +36,39 @@ namespace label {
 * \tparam value_t numeric type of the arrays with class labels
 * \param [in] y device array of labels, size [n]
 * \param [in] n number of labels
- * \param [out] y_unique device array of unique labels, unallocated on entry,
+ * \param [out] unique device array of unique labels, unallocated on entry,
 *   on exit it has size [n_unique]
- * \param [out] n_unique number of unique labels
 * \param [in] stream cuda stream
- * \param [in] allocator device allocator
+ * \return the number of unique labels
 */
 template
-void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
-                     cudaStream_t stream,
-                     std::shared_ptr allocator) {
-  raft::mr::device::buffer y2(allocator, stream, n);
-  raft::mr::device::buffer y3(allocator, stream, n);
-  raft::mr::device::buffer d_num_selected(allocator, stream, 1);
+int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n,
+                    cudaStream_t stream) {
+  rmm::device_scalar d_num_selected(stream);
+  rmm::device_uvector workspace(n, stream);
   size_t bytes = 0;
   size_t bytes2 = 0;
   // Query how much temporary storage we will need for cub operations
   // and allocate it
-  cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(),
+  cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n);
+  cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(),
                             d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
-  raft::mr::device::buffer cub_storage(allocator, stream, bytes);
+  rmm::device_uvector cub_storage(bytes, stream);
   // Select Unique classes
-  cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(),
-                            d_num_selected.data(), n);
-  raft::update_host(n_unique, d_num_selected.data(), 1, stream);
-  CUDA_CHECK(cudaStreamSynchronize(stream));
+  cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(),
+                                 n);
+  cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(),
+                            workspace.data(), d_num_selected.data(), n);
+  int n_unique = d_num_selected.value(stream);
   // Copy unique classes to output
-  *y_unique =
-    (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream);
-  raft::copy(*y_unique, y3.data(), *n_unique, stream);
+  unique.resize(n_unique, stream);
+  raft::copy(unique.data(), workspace.data(), n_unique, stream);
+
+  return n_unique;
 }
 /**
@@ -147,22 +145,17 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
 */
 template
 void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
-                    Lambda filter_op,
-                    std::shared_ptr allocator,
-                    bool zero_based = false) {
+                    Lambda filter_op, bool zero_based = false) {
   static const size_t TPB_X = 256;
   dim3 blocks(raft::ceildiv(N, TPB_X));
   dim3 threads(TPB_X);
-  Type *map_ids;
-  int num_clusters;
-  getUniquelabels(in, N, &map_ids, &num_clusters, stream, 
allocator); + rmm::device_uvector map_ids(0, stream); + int num_clusters = getUniquelabels(map_ids, in, N, stream); map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); - - allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); + map_ids.data(), num_clusters, in, out, N, filter_op, zero_based); } /** @@ -184,11 +177,9 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - std::shared_ptr allocator, bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 6bc1c08029..f64afb3549 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -25,6 +25,7 @@ #pragma once #include +#include #include "d_structs.h" #include "lap_functions.cuh" @@ -44,19 +45,19 @@ class LinearAssignmentProblem { VertexData d_row_data_dev, d_col_data_dev; raft::handle_t const &handle_; - raft::mr::device::buffer row_covers_v; - raft::mr::device::buffer col_covers_v; - raft::mr::device::buffer row_duals_v; - raft::mr::device::buffer col_duals_v; - raft::mr::device::buffer col_slacks_v; - raft::mr::device::buffer row_is_visited_v; - raft::mr::device::buffer col_is_visited_v; - raft::mr::device::buffer row_parents_v; - raft::mr::device::buffer col_parents_v; - raft::mr::device::buffer row_children_v; - raft::mr::device::buffer col_children_v; - raft::mr::device::buffer obj_val_primal_v; - raft::mr::device::buffer obj_val_dual_v; + rmm::device_uvector row_covers_v; + rmm::device_uvector col_covers_v; + rmm::device_uvector row_duals_v; + rmm::device_uvector col_duals_v; + rmm::device_uvector col_slacks_v; + rmm::device_uvector row_is_visited_v; + rmm::device_uvector col_is_visited_v; + rmm::device_uvector row_parents_v; + rmm::device_uvector col_parents_v; + rmm::device_uvector row_children_v; + rmm::device_uvector col_children_v; + rmm::device_uvector obj_val_primal_v; + rmm::device_uvector obj_val_dual_v; public: LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, @@ -66,19 +67,19 @@ class LinearAssignmentProblem { batchsize_(batchsize), epsilon_(epsilon), d_costs_(nullptr), - row_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_slacks_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {} + row_covers_v(0, handle_.get_stream()), + col_covers_v(0, handle_.get_stream()), + row_duals_v(0, handle_.get_stream()), + col_duals_v(0, 
handle_.get_stream()), + col_slacks_v(0, handle_.get_stream()), + row_is_visited_v(0, handle_.get_stream()), + col_is_visited_v(0, handle_.get_stream()), + row_parents_v(0, handle_.get_stream()), + col_parents_v(0, handle_.get_stream()), + row_children_v(0, handle_.get_stream()), + col_children_v(0, handle_.get_stream()), + obj_val_primal_v(0, handle_.get_stream()), + obj_val_dual_v(0, handle_.get_stream()) {} // Executes Hungarian algorithm on the input cost matrix. void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, @@ -152,19 +153,20 @@ class LinearAssignmentProblem { private: // Helper function for initializing global variables and arrays on a single host. void initializeDevice() { - row_covers_v.resize(batchsize_ * size_); - col_covers_v.resize(batchsize_ * size_); - row_duals_v.resize(batchsize_ * size_); - col_duals_v.resize(batchsize_ * size_); - col_slacks_v.resize(batchsize_ * size_); - row_is_visited_v.resize(batchsize_ * size_); - col_is_visited_v.resize(batchsize_ * size_); - row_parents_v.resize(batchsize_ * size_); - col_parents_v.resize(batchsize_ * size_); - row_children_v.resize(batchsize_ * size_); - col_children_v.resize(batchsize_ * size_); - obj_val_primal_v.resize(batchsize_); - obj_val_dual_v.resize(batchsize_); + cudaStream_t stream = handle_.get_stream(); + row_covers_v.resize(batchsize_ * size_, stream); + col_covers_v.resize(batchsize_ * size_, stream); + row_duals_v.resize(batchsize_ * size_, stream); + col_duals_v.resize(batchsize_ * size_, stream); + col_slacks_v.resize(batchsize_ * size_, stream); + row_is_visited_v.resize(batchsize_ * size_, stream); + col_is_visited_v.resize(batchsize_ * size_, stream); + row_parents_v.resize(batchsize_ * size_, stream); + col_parents_v.resize(batchsize_ * size_, stream); + row_children_v.resize(batchsize_ * size_, stream); + col_children_v.resize(batchsize_ * size_, stream); + obj_val_primal_v.resize(batchsize_, stream); + obj_val_dual_v.resize(batchsize_, stream); d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); @@ -231,17 +233,16 @@ class LinearAssignmentProblem { int hungarianStep3() { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), - handle_.get_stream(), 1); + rmm::device_scalar flag_v(handle_.get_stream()); bool h_flag = false; - raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); + flag_v.set_value_async(h_flag, handle_.get_stream()); detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); - raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); + h_flag = flag_v.value(handle_.get_stream()); next = h_flag ? 
4 : 5; diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 7640f3f816..830940f0ec 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -29,7 +29,8 @@ #include #include #include -#include +#include +#include #include #include @@ -125,10 +126,8 @@ inline void computeInitialAssignments(raft::handle_t const &handle, std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector row_lock_v(size, handle.get_stream()); + rmm::device_uvector col_lock_v(size, handle.get_stream()); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -214,25 +213,21 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, weight_t epsilon) { vertex_t M; - raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), - handle.get_stream(), 0); - raft::mr::device::buffer csr_neighbors_v( - handle.get_device_allocator(), handle.get_stream(), 0); + rmm::device_uvector csr_ptrs_v(0, handle.get_stream()); + rmm::device_uvector csr_neighbors_v(0, handle.get_stream()); { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); + rmm::device_uvector predicates_v(SP * N, handle.get_stream()); + rmm::device_uvector addresses_v(SP * N, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); - csr_ptrs_v.resize(SP + 1); + csr_ptrs_v.resize(SP + 1, handle.get_stream()); thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1}); @@ -251,7 +246,7 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, addresses_v.end(), addresses_v.begin()); if (M > 0) { - csr_neighbors_v.resize(M); + csr_neighbors_v.resize(M, handle.get_stream()); kernel_rowScatterCSR<<>>( @@ -300,10 +295,8 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector predicates_v(size, handle.get_stream()); + rmm::device_uvector addresses_v(size, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); @@ -329,8 +322,7 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), csr_size); + rmm::device_uvector elements_v(csr_size, handle.get_stream()); kernel_augmentScatter<<>>( @@ -358,10 +350,8 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - 
handle.get_stream(), SP * N);
-  raft::mr::device::buffer addresses_v(handle.get_device_allocator(),
-                                       handle.get_stream(), SP * N);
+  rmm::device_uvector predicates_v(SP * N, handle.get_stream());
+  rmm::device_uvector addresses_v(SP * N, handle.get_stream());
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
@@ -388,8 +378,8 @@ inline void augmentationPass(raft::handle_t const &handle,
   raft::lap::detail::calculateLinearDims(
     blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size);
-  raft::mr::device::buffer elements_v(
-    handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size);
+  rmm::device_uvector elements_v(row_ids_csr_size,
+                                 handle.get_stream());
   kernel_augmentScatter<<>>(
@@ -418,8 +408,7 @@ inline void dualUpdate(raft::handle_t const &handle,
   dim3 threads_per_block;
   int total_blocks;
-  raft::mr::device::buffer sp_min_v(handle.get_device_allocator(),
-                                    handle.get_stream(), 1);
+  rmm::device_scalar sp_min_v(handle.get_stream());
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, SP);
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index b5a93c4953..d6d064c20e 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -63,11 +63,11 @@ namespace linalg {
 * @code{.cpp}
 * // Initialize arrays
 * int ld_L = n_rows;
- * device_buffer L(allocator, stream, ld_L * n_rows);
+ * rmm::device_uvector L(ld_L * n_rows, stream);
 * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr,
 *                                       &n_bytes, CUBLAS_FILL_MODE_LOWER,
 *                                       stream);
- * device_buffer workspace(allocator, stream, n_bytes);
+ * rmm::device_uvector workspace(n_bytes, stream);
 *
- * for (n=1; n<=n_rows; rank++) {
+ * for (n=1; n<=n_rows; n++) {
 *   // Calculate a new row/column of matrix A into A_new
@@ -87,11 +87,11 @@ namespace linalg {
 * @code{.cpp}
 * // Initialize arrays
 * int ld_U = n_rows;
- * device_buffer U(allocator, stream, ld_U * n_rows);
+ * rmm::device_uvector U(ld_U * n_rows, stream);
 * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr,
 *                                       &n_bytes, CUBLAS_FILL_MODE_UPPER,
 *                                       stream);
- * device_buffer workspace(allocator, stream, n_bytes);
+ * rmm::device_uvector workspace(n_bytes, stream);
 *
 * for (n=1; n<=n_rows; n++) {
 *   // Calculate a new row/column of matrix A into array A_new
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 6172618380..5b2df3bcb3 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -22,7 +22,8 @@
 #include
 #include
 #include
-#include
+#include
+#include
 namespace raft {
 namespace linalg {
@@ -44,7 +45,6 @@ template
 void eigDC(const raft::handle_t &handle, 
const math_t *in, int n_rows, d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -93,7 +91,6 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -104,9 +101,9 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer d_dev_info(allocator, stream, 1); - raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { CUSOLVER_CHECK(cusolverDnsyevdx( @@ -127,9 +124,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -163,7 +158,6 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -176,8 +170,8 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer dev_info(allocator, stream, 1); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..9944685a1f 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -40,7 +41,7 @@ void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); - thrust::copy(thrust::cuda::par.on(stream), first, last, ptr); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); } /** diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index cafa8d54f1..cc912d7d86 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include namespace raft { 
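+// In the hunks below, the cuSOLVER scratch buffers (tau, workspace, R_full)
+// become rmm::device_uvector and the one-element devInfo buffer becomes an
+// rmm::device_scalar, both sized at construction on the caller's stream.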
namespace linalg { @@ -42,7 +43,6 @@ namespace linalg { template void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; @@ -50,14 +50,14 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - raft::mr::device::buffer tau(allocator, stream, k); + rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int Lwork; CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); @@ -86,12 +86,11 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, template void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; - raft::mr::device::buffer R_full(allocator, stream, m * n); - raft::mr::device::buffer tau(allocator, stream, min(m, n)); + rmm::device_uvector R_full(m * n, stream); + rmm::device_uvector tau(min(m, n), stream); CUDA_CHECK( cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; @@ -99,12 +98,12 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, cudaMemcpyDeviceToDevice, stream)); int Lwork; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf( cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 7357a68a4c..8b40a80903 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include "eig.cuh" #include "gemm.cuh" #include "transpose.h" @@ -54,8 +55,6 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, bool trans_right, bool gen_left_vec, bool gen_right_vec, cudaStream_t stream) { - std::shared_ptr allocator = - handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); @@ -71,13 +70,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int m = n_rows; const int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); T *d_rwork = nullptr; int lwork = 0; CUSOLVER_CHECK( cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, 
lwork); + rmm::device_uvector d_work(lwork, stream); char jobu = 'S'; char jobvt = 'A'; @@ -112,12 +111,11 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, template void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, T *U, T *V, bool gen_left_vec, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; - raft::mr::device::buffer in_cross_mult(allocator, stream, len); + rmm::device_uvector in_cross_mult(len, stream); T alpha = T(1); T beta = T(0); @@ -162,7 +160,6 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, math_t *sing_vals, math_t *left_sing_vecs, math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, math_t tol, int max_sweeps, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -174,7 +171,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int m = n_rows; int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int lwork = 0; int econ = 1; @@ -183,7 +180,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); + rmm::device_uvector d_work(lwork, stream); CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, @@ -210,10 +207,8 @@ template void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, math_t *V, math_t *out, int n_rows, int n_cols, int k, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); - const math_t alpha = 1.0, beta = 0.0; - raft::mr::device::buffer SVT(allocator, stream, k * n_cols); + rmm::device_uvector SVT(k * n_cols, stream); raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); @@ -239,14 +234,13 @@ template bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, math_t *S_vec, math_t *V, int n_rows, int n_cols, int k, math_t tol, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; // form product matrix - raft::mr::device::buffer P_d(allocator, stream, m * n); - raft::mr::device::buffer S_mat(allocator, stream, k * k); + rmm::device_uvector P_d(m * n, stream); + rmm::device_uvector S_mat(k * k, stream); CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); @@ -262,7 +256,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; - raft::mr::device::buffer A_minus_P(allocator, stream, m * n); + rmm::device_uvector A_minus_P(m * n, stream); CUDA_CHECK( cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..db1cabd694 100644 --- a/cpp/include/raft/linalg/transpose.h +++ 
b/cpp/include/raft/linalg/transpose.h @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include namespace raft { namespace linalg { @@ -60,7 +60,7 @@ void transpose(math_t *inout, int n, cudaStream_t stream) { auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { int s_row = idx % m; int s_col = idx / m; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 0a72117140..41ca85dce0 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace matrix { @@ -285,7 +285,6 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param src: input matrix * @param dest: output matrix. The result is stored in the dest matrix * @param len: number elements of input matrix - * @param allocator device allocator * @param stream cuda stream */ template @@ -294,10 +293,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer d_sum(allocator, stream, 1); + rmm::device_scalar d_sum(stream); auto *d_sum_ptr = d_sum.data(); auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..688b92da09 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include namespace raft { namespace matrix { @@ -64,7 +64,7 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % n_rows_indices; idx_t col = idx / n_rows_indices; @@ -108,7 +108,7 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, auto d_q_trunc = out; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % m; idx_t col = idx / m; @@ -133,8 +133,8 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = dest_row; @@ -161,8 +161,8 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + 
thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = (m - dest_row) - 1; diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 56710ea81f..3d2e44e49b 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -24,9 +24,8 @@ #include #include #include -#include -#include #include +#include #include #include "rng_impl.cuh" @@ -498,7 +497,6 @@ class Rng { * sampling is desired * @param sampledLen output sampled array length * @param len input array length - * @param allocator device allocator for allocating any workspace required * @param stream cuda stream */ template @@ -509,13 +507,10 @@ class Rng { ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer expWts(allocator, stream, len); - raft::mr::device::buffer sortedWts(allocator, stream, len); - raft::mr::device::buffer inIdx(allocator, stream, len); - raft::mr::device::buffer outIdxBuff(allocator, stream, len); + rmm::device_uvector expWts(len, stream); + rmm::device_uvector sortedWts(len, stream); + rmm::device_uvector inIdx(len, stream); + rmm::device_uvector outIdxBuff(len, stream); auto *inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( @@ -533,7 +528,7 @@ class Rng { ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items IdxT *outIdxPtr = outIdxBuff.data(); - raft::mr::device::buffer workspace(allocator, stream); + rmm::device_uvector workspace(0, stream); sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index a034bdbda8..79b18ebd0a 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include #include @@ -49,16 +48,15 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, int *dst_offsets, int *dstCols, value_t *dstVals) { auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer dstRows(d_alloc, stream, nnz); + rmm::device_uvector dstRows(nnz, stream); CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); - raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); - raft::mr::device::buffer P(d_alloc, stream, nnz); + rmm::device_uvector pBuffer(buffSize, stream); + rmm::device_uvector P(nnz, stream); CUSPARSE_CHECK( cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), @@ -147,14 +145,12 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param nnz: size of COO rows array * @param row_ind: output row indices array * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ 
template void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, cudaStream_t stream) { - raft::mr::device::buffer row_counts(d_alloc, stream, m); + rmm::device_uvector row_counts(m, stream); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -164,7 +160,7 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } @@ -173,15 +169,11 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * * @param coo: Input COO matrix * @param row_ind: output row indices array - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); +void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 73120fea8c..fa21614f8f 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -17,13 +17,11 @@ #include #include #include -#include -#include +#include #include #include -#include #include #include @@ -58,9 +56,9 @@ namespace sparse { template class COO { protected: - raft::mr::device::buffer rows_arr; - raft::mr::device::buffer cols_arr; - raft::mr::device::buffer vals_arr; + rmm::device_uvector rows_arr; + rmm::device_uvector cols_arr; + rmm::device_uvector vals_arr; public: Index_Type nnz; @@ -68,13 +66,12 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) - : rows_arr(d_alloc, stream, 0), - cols_arr(d_alloc, stream, 0), - vals_arr(d_alloc, stream, 0), + COO(cudaStream_t stream) + : rows_arr(0, stream), + cols_arr(0, stream), + vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0) {} @@ -87,10 +84,9 @@ class COO { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - COO(raft::mr::device::buffer &rows, - raft::mr::device::buffer &cols, - raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, - Index_Type n_cols = 0) + COO(rmm::device_uvector &rows, + rmm::device_uvector &cols, rmm::device_uvector &vals, + Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) : rows_arr(rows), cols_arr(cols), vals_arr(vals), @@ -99,19 +95,17 @@ class COO { n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use * @param stream: CUDA stream to use * @param nnz: size of the rows/cols/vals arrays * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) - : rows_arr(d_alloc, stream, nnz), - cols_arr(d_alloc, stream, nnz), - vals_arr(d_alloc, stream, nnz), + COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, + 
Index_Type n_cols = 0, bool init = true) + : rows_arr(nnz, stream), + cols_arr(nnz, stream), + vals_arr(nnz, stream), nnz(nnz), n_rows(n_rows), n_cols(n_cols) { diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index bc4a68d296..041aedf41c 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -20,8 +20,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -208,7 +208,6 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) @@ -216,11 +215,8 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, template bool> void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { - raft::mr::device::buffer m(d_alloc, stream, 1); - + Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); @@ -245,15 +241,12 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use */ template void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer m(d_alloc, stream, 1); + Index_ nnz, Index_ N, cudaStream_t stream) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index f3109556b7..6885c250c0 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -21,13 +21,11 @@ #include #include #include -#include - -#include - #include #include +#include #include +#include #include @@ -84,7 +82,6 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - std::shared_ptr alloc, cudaStream_t stream, expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); @@ -130,7 +127,7 @@ class jaccard_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t denom = q_r_union - dot; @@ -179,7 +176,7 @@ class dice_expanded_distances_t : public distances_t { compute_bin_distance( 
out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t dice = (2 * dot) / q_r_union; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 3a78f9ada0..24be171900 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "../csr.cuh" @@ -35,8 +34,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..3b57225350 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -22,8 +22,6 @@ #include "coo_mask_row_iterators.cuh" #include -#include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..74eb37bc2b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -20,7 +20,6 @@ #include "../utils.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..a95c6ff85b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -55,7 +55,7 @@ class hash_strategy : public coo_spmv_strategy { rmm::device_uvector &mask_indptr, std::tuple &n_rows_divided, cudaStream_t stream) { - auto policy = rmm::exec_policy(stream); + auto policy = this->config.handle.get_thrust_policy(); auto less = thrust::copy_if( policy, thrust::make_counting_iterator(value_idx(0)), diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index a1974b3666..03df396b2e 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 882ccba027..b1e2756671 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -22,8 +22,6 @@ #include #include -#include - #include #include #include @@ -31,6 +29,7 @@ #include #include #include +#include #include diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 8886d4c9df..6ccfd4adcb 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -21,17 +21,13 @@ #include #include #include +#include +#include #include #include -#include - -#include - -#include #include - 
-#include #include +#include #include @@ -127,9 +123,8 @@ template alloc, - cudaStream_t stream, expansion_f expansion_func) { + value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); CUDA_CHECK( @@ -161,7 +156,6 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, - std::shared_ptr alloc, cudaStream_t stream) { // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); @@ -221,7 +215,7 @@ class l2_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { return -2 * dot + q_norm + r_norm; }); @@ -283,7 +277,6 @@ class correlation_expanded_distances_t : public distances_t { compute_corr(out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, config_->b_ncols, - config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -322,7 +315,7 @@ class cosine_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t norms = sqrt(q_norm) * sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..3bee1bc87d 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -21,8 +21,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 3cffa1c28a..4ef2ac43e2 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -100,9 +99,8 @@ class UnionFind { template void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, - std::size_t nnz, value_idx *children, - value_t *out_delta, value_idx *out_size) { - auto d_alloc = handle.get_device_allocator(); + size_t nnz, value_idx *children, value_t *out_delta, + value_idx *out_size) { auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -225,11 +223,10 @@ struct init_label_roots { */ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, - std::size_t n_clusters, std::size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); + const value_idx *children, size_t n_clusters, + size_t n_leaves) { auto stream = handle.get_stream(); - auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = 
handle.get_thrust_policy(); // Handle special case where n_clusters == 1 if (n_clusters == 1) { diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..31e4a0f263 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -60,12 +59,11 @@ struct distance_graph_impl &indptr, rmm::device_uvector &indices, rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); @@ -78,7 +76,7 @@ struct distance_graph_impl &tup) { bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); @@ -86,9 +84,8 @@ struct distance_graph_impl(tup)); }); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..6ef6f9879b 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -25,12 +25,9 @@ #include #include -#include - #include #include #include -#include namespace raft { namespace hierarchy { @@ -80,18 +77,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - raft::sparse::COO connected_edges(d_alloc, stream); + raft::sparse::COO connected_edges(stream); raft::linkage::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process @@ -136,7 +131,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, int max_iter = 10) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. 
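// Illustrative sketch (not from this patch): several hunks above swap
// thrust::cuda::par.on(stream) for either rmm::exec_policy(stream) or, where a
// handle is available, the new handle.get_thrust_policy(). Both hand Thrust an
// execution policy that runs on the given stream and draws temporary storage
// from RMM. A hedged example, assuming a valid raft::handle_t:
#include <thrust/scan.h>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

void scan_sketch(const raft::handle_t &handle,
                 rmm::device_uvector<int> &counts) {
  // Preferred inside RAFT when a handle is in scope:
  thrust::exclusive_scan(handle.get_thrust_policy(), counts.begin(),
                         counts.end(), counts.begin());
  // Equivalent when only a stream is at hand:
  // thrust::exclusive_scan(rmm::exec_policy(handle.get_stream()),
  //                        counts.begin(), counts.end(), counts.begin());
}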
@@ -145,7 +139,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, d_alloc, stream); + int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, @@ -153,7 +147,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, iters++; - n_components = linkage::get_n_components(color, m, d_alloc, stream); + n_components = linkage::get_n_components(color, m, stream); } /** diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..06fffb8aed 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -58,7 +58,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); rmm::device_uvector indices(EMPTY, stream); diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 47b1ba6e41..7ed627b9e2 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -156,19 +156,17 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) * @param out_ind: output row_ind array - * @param d_alloc: device allocator to use for temp memory * @param stream: cuda stream to use */ template size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, int *out_ind, - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); + rmm::device_uvector row_counts(m + 1, stream); CUDA_CHECK( cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); @@ -184,7 +182,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 15302f3b74..ce0c4bbe6f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -18,9 +18,8 @@ #include #include -#include -#include #include +#include #include #include @@ -35,16 +34,15 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_components, T *out, unsigned long long seed = 1234567) { auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); - raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); - raft::mr::device::buffer 
dst_vals(d_alloc, stream, nnz); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); - raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); - raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); - raft::mr::device::buffer labels(d_alloc, stream, n); + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -65,8 +63,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); - using thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; @@ -85,15 +81,14 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using value_type_t = value_type; std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const *__restrict__ obs, index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + raft::spectral::partition(handle, r_csr_m, eig_solver, no_op_cluster_solver_t{}, labels.data(), eigVals.data(), eigVecs.data()); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5c2c78f0c3..a6e1027288 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -31,8 +31,6 @@ #include #include -#include -#include #include #include @@ -122,22 +120,20 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param in: Input COO matrix * @param out: Output symmetrized COO matrix * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); - raft::mr::device::buffer in_row_ind(d_alloc, stream, in->n_rows); + rmm::device_uvector in_row_ind(in->n_rows, stream); - convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + convert::sorted_coo_to_csr(in, in_row_ind.data(), stream); out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); @@ -250,14 +246,14 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param k: Number of n_neighbors * @param out: Output COO Matrix class * @param stream: Input cuda stream - * @param d_alloc device allocator for temporary buffers */ template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +void 
from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, + const value_t *restrict knn_dists, + const value_idx n, const int k, + COO *out, + cudaStream_t stream) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); @@ -265,11 +261,11 @@ void from_knn_symmetrize_matrix( raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) - raft::mr::device::buffer row_sizes(d_alloc, stream, n); + rmm::device_uvector row_sizes(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); - raft::mr::device::buffer row_sizes2(d_alloc, stream, n); + rmm::device_uvector row_sizes2(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); @@ -298,8 +294,8 @@ void from_knn_symmetrize_matrix( thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, + __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -314,7 +310,6 @@ template void symmetrize(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t m, size_t n, size_t nnz, raft::sparse::COO &out) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -333,7 +328,7 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, // sort COO raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); + symm_vals.data(), stream); raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 6afe4ca8f6..7ad4b93ec0 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include #include @@ -53,7 +52,6 @@ namespace linalg { * @param[in] csr_nrows : Number of rows in CSR * @param[in] csr_ncols : Number of columns in CSR * @param[in] nnz : Number of nonzeros of CSR - * @param[in] allocator : Allocator for intermediate memory * @param[in] stream : Cuda stream for ordering events */ template @@ -61,9 +59,7 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, const value_idx *csr_indices, const value_t *csr_data, value_idx *csc_indptr, value_idx *csc_indices, value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, - std::shared_ptr allocator, - cudaStream_t stream) { + value_idx nnz, cudaStream_t stream) { size_t convert_csc_workspace_size = 0; CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( @@ -72,8 +68,8 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &convert_csc_workspace_size, stream)); - raft::mr::device::buffer convert_csc_workspace( - allocator, stream, convert_csc_workspace_size); + rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, + stream); CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, 
csr_indices, diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..33b980afcd 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -23,11 +23,10 @@ #include "utils.cuh" #include -#include -#include +#include +#include #include -#include #include #include #include @@ -35,11 +34,6 @@ #include #include -#include - -#include -#include - namespace raft { namespace mst { typedef std::chrono::high_resolution_clock Clock; @@ -65,20 +59,20 @@ MST_solver::MST_solver( offsets(offsets_), indices(indices_), weights(weights_), - altered_weights(e_), + altered_weights(e_, stream_), v(v_), e(e_), color_index(color_), - color(v_), - next_color(v_), - min_edge_color(v_), - new_mst_edge(v_), - mst_edge(e_, false), - temp_src(2 * v_), - temp_dst(2 * v_), - temp_weights(2 * v_), - mst_edge_count(1, 0), - prev_mst_edge_count(1, 0), + color(v_, stream_), + next_color(v_, stream_), + min_edge_color(v_, stream_), + new_mst_edge(v_, stream_), + mst_edge(e_, stream_), + temp_src(2 * v_, stream_), + temp_dst(2 * v_, stream_), + temp_weights(2 * v_, stream_), + mst_edge_count(1, stream_), + prev_mst_edge_count(1, stream_), stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), @@ -87,13 +81,18 @@ MST_solver::MST_solver( max_threads = handle_.get_device_properties().maxThreadsPerBlock; sm_count = handle_.get_device_properties().multiProcessorCount; + mst_edge_count.set_value_to_zero_async(stream); + prev_mst_edge_count.set_value_to_zero_async(stream); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), + stream)); + //Initially, color holds the vertex id as color - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); thrust::sequence(policy, color_index, color_index + v, 0); } else { - raft::copy(color.data().get(), color_index, v, stream); + raft::copy(color.data(), color_index, v, stream); } thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } @@ -160,12 +159,12 @@ MST_solver::solve() { timer3 += duration_us(stop - start); #endif - auto curr_mst_edge_count = mst_edge_count[0]; + auto curr_mst_edge_count = mst_edge_count.value(stream); RAFT_EXPECTS(curr_mst_edge_count <= max_mst_edges, "Number of edges found by MST is invalid. This may be due to " "loss in precision. 
Try increasing precision of weights."); - if (curr_mst_edge_count == prev_mst_edge_count[0]) { + if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 @@ -196,12 +195,11 @@ MST_solver::solve() { #endif // copy this iteration's results and store - prev_mst_edge_count = mst_edge_count; + prev_mst_edge_count.set_value_async(curr_mst_edge_count, stream); } // result packaging - thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = mst_edge_count.value(stream); mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -227,8 +225,8 @@ template alteration_t MST_solver::alteration_max() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - rmm::device_vector tmp(e); + auto policy = handle.get_thrust_policy(); + rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); //sort tmp weights @@ -242,7 +240,7 @@ MST_solver::alteration_max() { thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; + auto init = tmp.element(1, stream) - tmp.element(0, stream); auto max = thrust::transform_reduce(policy, begin, end, alteration_functor(), init, thrust::minimum()); @@ -261,7 +259,7 @@ void MST_solver::alteration() { alteration_t max = alteration_max(); // pool of rand values - rmm::device_vector rand_values(v); + rmm::device_uvector rand_values(v, stream); // Random number generator curandGenerator_t randGen; @@ -269,8 +267,7 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, @@ -278,8 +275,8 @@ void MST_solver::alteration() { //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data(), + altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher @@ -288,23 +285,24 @@ template ::label_prop( vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors - thrust::host_vector curr_mst_edge_count = mst_edge_count; + edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); auto min_pair_nblocks = std::min( (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); - rmm::device_vector done(1, false); - - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); - vertex_t* next_color_ptr = next_color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + vertex_t* color_ptr = color.data(); + vertex_t* next_color_ptr = next_color.data(); 
- bool* done_ptr = done.data().get(); + rmm::device_scalar done(stream); + done.set_value_to_zero_async(stream); + bool* done_ptr = done.data(); + const bool true_val = true; auto i = 0; - while (!done[0]) { - done[0] = true; + while (!done.value(stream)) { + done.set_value_async(true_val, stream); detail::min_pair_colors<<>>( v, indices, new_mst_edge_ptr, color_ptr, color_index, next_color_ptr); @@ -327,7 +325,7 @@ template void MST_solver::min_edge_per_vertex() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), @@ -335,11 +333,11 @@ void MST_solver>>( offsets, indices, altered_weights_ptr, color_ptr, color_index, @@ -354,18 +352,18 @@ void MST_solver::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); - alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); + alteration_t* altered_weights_ptr = altered_weights.data(); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); detail::min_edge_per_supervertex<<>>( color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, @@ -390,8 +388,8 @@ void MST_solver::check_termination() { std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges - edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + edge_t* mst_edge_count_ptr = mst_edge_count.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -411,9 +409,9 @@ template void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); - auto curr_mst_edge_count = prev_mst_edge_count[0]; + edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..4d5ca6ebe1 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #define MST_TIME namespace raft { @@ -32,7 +32,7 @@ __device__ idx_t get_1D_idx() { // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..44b34ee5c7 100644 --- 
a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -18,8 +18,8 @@ #pragma once #include +#include #include -#include namespace raft { @@ -68,24 +68,24 @@ class MST_solver { vertex_t sm_count; vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector + rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_vector + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_vector + rmm::device_scalar prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration - rmm::device_vector temp_src; - rmm::device_vector temp_dst; - rmm::device_vector temp_weights; + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; void label_prop(vertex_t* mst_src, vertex_t* mst_dst); void min_edge_per_vertex(); diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 562d506cfe..492058f85f 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -84,11 +84,9 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, template void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer ex_scan(d_alloc, stream, n); - raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); + int *cur_cnnz, T scalar, int n, cudaStream_t stream) { + rmm::device_uvector ex_scan(n, stream); + rmm::device_uvector cur_ex_scan(n, stream); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); @@ -96,14 +94,14 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); @@ -122,15 +120,12 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param in: input COO matrix * @param out: output COO matrix * 
@param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); - raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); +void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { + rmm::device_uvector row_count_nz(in->n_rows, stream); + rmm::device_uvector row_count(in->n_rows, stream); CUDA_CHECK( cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); @@ -146,7 +141,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); @@ -154,7 +149,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, out->rows(), out->cols(), out->vals(), row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + in->n_rows, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -163,14 +158,11 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, * * @param in: input COO matrix * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), stream); } }; // namespace op diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..09a35720fb 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -32,7 +31,6 @@ #include #include #include -#include #include #include @@ -126,18 +124,16 @@ void max_duplicates(const raft::handle_t &handle, raft::sparse::COO &out, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan rmm::device_uvector diff(nnz + 1, stream); compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), + diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 09d5b568be..c40801a0b1 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -20,9 +20,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -59,35 +58,28 @@ struct TupleComp { * @param rows rows array from coo matrix * @param cols cols array from coo matrix * @param vals vals array from coo matrix - * @param d_alloc device 
allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - // TODO: Remove this - std::shared_ptr d_alloc, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, + vals, TupleComp()); } /** * @brief Sort the underlying COO arrays by row * @tparam T: the type name of the underlying value array * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, - // TODO: Remove this - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_sort(COO *const in, cudaStream_t stream) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + in->vals(), stream); } /** @@ -107,8 +99,7 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(rmm::exec_policy(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..46369ca964 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -159,14 +159,10 @@ struct CubKVPMinReduce { */ template value_idx get_n_components(value_idx *colors, size_t n_rows, - std::shared_ptr d_alloc, cudaStream_t stream) { - value_idx *map_ids; - int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); - d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); - + rmm::device_uvector map_ids(0, stream); + int num_clusters = + raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -197,15 +193,13 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] d_alloc device allocator to use * @param[in] stream cuda stream for which to order cuda operations */ template void perform_1nn(cub::KeyValuePair *kvp, value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, - std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + size_t n_rows, size_t n_cols, cudaStream_t stream, + red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); @@ -218,7 +212,7 @@ void perform_1nn(cub::KeyValuePair *kvp, workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } @@ -239,15 +233,15 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors, cub::KeyValuePair *kvp, value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator 
arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, + src_indices); auto keys = thrust::make_zip_iterator(thrust::make_tuple( colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } @@ -324,7 +318,6 @@ void connect_components(const raft::handle_t &handle, size_t n_rows, size_t n_cols, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -336,10 +329,9 @@ void connect_components(const raft::handle_t &handle, // Normalize colors so they are drawn from a monotonically increasing set raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, stream); /** * First compute 1-nn for all colors where the color of each data point @@ -351,7 +343,7 @@ void connect_components(const raft::handle_t &handle, rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + n_rows, n_cols, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) @@ -369,7 +361,7 @@ void connect_components(const raft::handle_t &handle, raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), out_index.data() + out_index.size(), out_index.data()); // compute final size @@ -380,7 +372,7 @@ void connect_components(const raft::handle_t &handle, size++; - raft::sparse::COO min_edges(d_alloc, stream); + raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); min_components_by_color(min_edges, out_index.data(), src_indices.data(), diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 71fbb8ab3d..3566939bc4 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -415,7 +414,6 @@ class sparse_knn_t { * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) * @param[in] k the number of neighbors to query * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to * @param[in] batch_size_index maximum number of rows to use from index matrix per batch * @param[in] batch_size_query maximum number of rows to use from query matrix per batch diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 
1cf225087a..1cdd66f516 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -96,7 +96,6 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); size_t nnz = m * k; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..2cdf9bf4f5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,15 +22,12 @@ #include #include -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * @brief Flat C++ API function to build an approximate nearest neighbors index * from an index array and a set of parameters. diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..77ad4afe96 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -39,14 +39,11 @@ #include #include -#include #include #include -#include #include -#include #include #include @@ -145,8 +142,7 @@ void approx_knn_build_index(raft::handle_t &handle, // perform preprocessing // k set to 0 (unused during preprocessing / revertion) std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + create_processor(metric, n, D, 0, false, handle.get_stream()); query_metric_processor->preprocess(index_array); @@ -183,7 +179,7 @@ void approx_knn_search(raft::handle_t &handle, float *distances, // perform preprocessing std::unique_ptr> query_metric_processor = create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 09494e9eb1..84c130b0e4 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -27,7 +29,6 @@ #include #include -#include #include #include #include @@ -179,7 +180,6 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[out] res_I pointer to device memory for returning k nearest indices * @param[out] res_D pointer to device memory for returning k nearest distances * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory * @param[in] userStream the main cuda stream to use * @param[in] internalStreams optional when n_params > 0, the index partitions can be * queried in parallel using these streams. 
Note that n_int_streams also @@ -198,7 +198,6 @@ template void brute_force_knn_impl(std::vector &input, std::vector &sizes, IntType D, float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, int n_int_streams = 0, bool rowMajorIndex = true, @@ -228,28 +227,26 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); std::vector>> metric_processors( input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor(metric, sizes[i], D, k, + rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, - id_ranges->size()); + rmm::device_uvector trans(id_ranges->size(), userStream); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); - raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::device::buffer all_I(allocator, userStream, 0); + rmm::device_uvector all_D(0, userStream); + rmm::device_uvector all_I(0, userStream); float *out_D = res_D; int64_t *out_I = res_I; diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index a645412c2f..876e91e877 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,16 +19,14 @@ #include #include #include -#include -#include #include #include +#include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; /** * @brief A virtual class defining pre- and post-processing * for metrics. 
This class will temporarily modify its given @@ -56,16 +54,13 @@ class CosineMetricProcessor : public MetricProcessor { size_t n_rows_; size_t n_cols_; cudaStream_t stream_; - std::shared_ptr device_allocator_; - raft::mr::device::buffer colsums_; + rmm::device_uvector colsums_; public: CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream, - std::shared_ptr allocator) - : device_allocator_(allocator), - stream_(stream), - colsums_(allocator, stream, n_rows), + cudaStream_t stream) + : stream_(stream), + colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), @@ -104,11 +99,9 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { public: CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, - std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), - means_(allocator, stream, n_rows) {} + bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), + means_(n_rows, stream) {} void preprocess(math_t *data) { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; @@ -143,7 +136,7 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { ~CorrelationMetricProcessor() = default; - raft::mr::device::buffer means_; + rmm::device_uvector means_; }; template @@ -161,18 +154,18 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { + cudaStream_t userStream) { MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = + new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + userStream); break; default: mp = new DefaultMetricProcessor(); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..71c547c281 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -18,15 +18,12 @@ #include "detail/knn_brute_force_faiss.cuh" -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - template inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, value_idx *outV, size_t n_samples, int n_parts, @@ -72,8 +69,7 @@ inline void brute_force_knn( std::vector int_streams = handle.get_internal_streams(); detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), + k, handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, translations, metric, metric_arg); } diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..6f507331d9 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -42,19 +42,16 @@ struct kmeans_solver_t { size_type_t> const& config) : config_(config) {} - template std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, 
size_type_t dim, + handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed); + kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, + config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..b6f0105487 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -325,7 +324,6 @@ static __global__ void divideCentroids( * Centroid is randomly chosen with k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -341,12 +339,9 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template -static int chooseNewCentroid(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, - value_type_t rand, +template +static int chooseNewCentroid(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -357,8 +352,9 @@ static int chooseNewCentroid(handle_t const& handle, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -417,10 +413,7 @@ static int chooseNewCentroid(handle_t const& handle, * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -439,14 +432,12 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
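chooseNewCentroid above is representative of every static helper in kmeans.hpp: the thrust_exe_pol_t template parameter is dropped because the handle now hands out the policy itself. A sketch of the pattern, with a helper name of our own:

```cpp
#include <raft/handle.hpp>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// Stream, cuBLAS handle, and thrust policy all come from the handle now;
// get_thrust_policy() is the method this PR adds to handle_t.
template <typename value_t>
void cumulative_dists(raft::handle_t const& handle, value_t* dists, int n) {
  auto policy = handle.get_thrust_policy();
  thrust::inclusive_scan(policy, thrust::device_pointer_cast(dists),
                         thrust::device_pointer_cast(dists + n),
                         thrust::device_pointer_cast(dists));
}
```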
*/ -template +template static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { + handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -458,8 +449,9 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -486,8 +478,8 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -499,8 +491,8 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -529,10 +521,7 @@ static int initializeCentroids( * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -553,16 +542,18 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
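On the caller side, the same simplification shows up in the cluster_solvers.hpp hunk earlier in this patch: solve() now takes only the handle plus problem arguments. A sketch, assuming the config fields match the usage in the tests further below; sizes and values are placeholders:

```cpp
#include <utility>

#include <raft/handle.hpp>
#include <raft/spectral/cluster_solvers.hpp>
#include <rmm/device_uvector.hpp>

// Drive the k-means cluster solver without a thrust policy argument.
void run_cluster_solver(raft::handle_t const& handle) {
  int n = 100, d = 10, k = 5;
  rmm::device_uvector<double> eigvecs(n * d, handle.get_stream());
  rmm::device_uvector<int> codes(n, handle.get_stream());

  raft::cluster_solver_config_t<int, double> cfg{k, /*maxIter=*/50,
                                                 /*tol=*/1e-4};
  raft::kmeans_solver_t<int, double> solver{cfg};

  std::pair<double, int> res =  // (residual, iterations)
    solver.solve(handle, n, d, eigvecs.data(), codes.data());
  (void)res;
}
```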
*/ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto cublas_h = handle.get_cublas_handle(); +template +static int assignCentroids(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -606,10 +597,7 @@ static int assignCentroids( * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -628,10 +616,8 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, +template +static int updateCentroids(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, @@ -649,8 +635,9 @@ static int updateCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory thrust::device_ptr obs_copy(work); @@ -722,10 +709,7 @@ namespace raft { * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -754,11 +738,10 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
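For users, the visible change is the public kmeans() entry point: the convenience overload rewritten further below shrinks to handle-plus-problem arguments. A sketch of the new call shape, where k, tol, and maxiter are placeholders:

```cpp
#include <raft/handle.hpp>
#include <raft/spectral/kmeans.hpp>
#include <rmm/device_uvector.hpp>

// Cluster n points of dimension d stored in d_obs (device memory).
void cluster(raft::handle_t const& handle, const float* d_obs, int n, int d) {
  rmm::device_uvector<int> codes(n, handle.get_stream());
  float residual{};
  int iters{};
  raft::kmeans(handle, n, d, /*k=*/4, /*tol=*/1e-4f, /*maxiter=*/100,
               d_obs, codes.data(), residual, iters);
}
```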
*/ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, @@ -785,16 +768,17 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Initialization // ------------------------------------------------------- - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -840,21 +824,21 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, + work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids @@ -868,12 +852,11 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); emptyCentroid = (thrust::find(thrust_exec_policy, @@ -905,10 +888,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * k-means++ 
algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -926,11 +906,10 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, value_type_t& residual, index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; @@ -950,9 +929,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Perform k-means return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), + centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..42fc621a1a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -72,52 +73,30 @@ struct vector_view_t { : buffer_(buffer), size_(sz) {} vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { - other.buffer_ = nullptr; - other.size_ = 0; - } + : buffer_(other.raw()), size_(other.size()) {} vector_view_t& operator=(vector_view_t&& other) { - buffer_ = other.buffer_; - size_ = other.size_; - - other.buffer_ = nullptr; - other.size_ = 0; + buffer_ = other.raw(); + size_ = other.size(); } }; -// allocatable vector, using raft handle allocator -// template class vector_t { - handle_t const& handle_; - value_type* buffer_; - size_type size_; - cudaStream_t stream_; - public: vector_t(handle_t const& raft_handle, size_type sz) - : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), - size_(sz), - stream_(raft_handle.get_stream()) {} - - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); - } + : buffer_(sz, raft_handle.get_stream()), + thrust_policy(raft_handle.get_thrust_policy()) {} - size_type size(void) const { return size_; } + size_type size(void) const { return buffer_.size(); } - value_type* raw(void) { return buffer_; } + value_type* raw(void) { return buffer_.data(); } - value_type const* raw(void) const { return buffer_; } + value_type const* raw(void) const { return buffer_.data(); } - template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + value_type nrm1() const { + return 
thrust::reduce(thrust_policy, buffer_.data(), + buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ? right : -right; @@ -125,10 +104,15 @@ class vector_t { }); } - template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_, size_, value); + void fill(value_type value) { + thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } + + private: + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + rmm::device_uvector buffer_; + const thrust_exec_policy_t thrust_policy; }; template @@ -280,31 +264,26 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - template - laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, - index_type const* row_offsets, + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - template laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, csr_m.col_indices_, csr_m.values_, csr_m.nrows_, csr_m.nnz_), diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } @@ -351,27 +330,19 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, row_offsets, col_indices, values, nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..fededbfcb4 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -79,19 +78,18 @@ using namespace linalg; * performed. * @return error flag. 
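The reworked vector_t above is the piece that makes this possible: it owns an rmm::device_uvector and caches the handle's thrust policy at construction, so fill() and nrm1() lose their policy parameters and laplacian_matrix_t / modularity_matrix_t lose their ThrustExePolicy template arguments. A usage sketch, with size and value arbitrary:

```cpp
#include <raft/handle.hpp>
#include <raft/spectral/matrix_wrappers.hpp>

// Fill a device vector with ones and take its L1 norm; no policy passed.
void vector_demo(raft::handle_t const& handle) {
  raft::matrix::vector_t<float> v{handle, 1000};
  v.fill(1.0f);
  float l1 = v.nrm1();  // reduces |x_i| on the handle's stream; 1000 here
  (void)l1;
}
```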
*/ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -101,7 +99,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -111,7 +109,7 @@ std::tuple modularity_maximization( eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // notice that at this point the matrix has already been transposed, so we are scaling // columns @@ -119,8 +117,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -138,9 +136,8 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, @@ -163,15 +160,15 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; // Initialize output modularity = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, + clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } @@ -180,7 +177,7 @@ void analyzeModularity(handle_t const &handle, modularity += partModularity; } - modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); + modularity = modularity / B.diagonal_.nrm1(); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..2df3812a4a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -62,19 +61,18 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
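partition() in the next hunk gets the same treatment as modularity_maximization() above. A sketch of the new call shape; eigen/cluster solver and CSR setup are assumed to follow the tests later in this patch:

```cpp
#include <raft/handle.hpp>
#include <raft/spectral/partition.hpp>

// Thin generic wrapper: no thrust policy is threaded through any more.
template <typename vertex_t, typename weight_t, typename EigenSolver,
          typename ClusterSolver>
void run_partition(raft::handle_t const& handle,
                   raft::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr,
                   EigenSolver const& eig, ClusterSolver const& cluster,
                   vertex_t* clusters, weight_t* eigVals, weight_t* eigVecs) {
  // stats = (#eigensolver iters, cluster residual, #cluster-solver iters)
  auto stats = raft::spectral::partition(handle, csr, eig, cluster, clusters,
                                         eigVals, eigVecs);
  (void)stats;
}
```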
*/ -template +template std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -89,7 +87,7 @@ std::tuple partition( // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -99,11 +97,11 @@ std::tuple partition( eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -129,9 +127,8 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -140,8 +137,8 @@ void analyzePartition(handle_t const &handle, vertex_t i; vertex_t n = csr_m.nrows_; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -155,7 +152,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; // Initialize output cost = 0; @@ -164,8 +161,8 @@ void analyzePartition(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, + part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..c148350c0f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -108,13 +107,12 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { return cudaSuccess; } -template -void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { - auto 
cublas_h = handle.get_cublas_handle(); +template +void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, + weight_t* eigVecs) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -187,16 +185,15 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template -bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, +template +bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, + weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..d280b3e95c 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -49,8 +49,7 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } TEST(Raft, ModularitySolvers) { @@ -89,14 +88,12 @@ TEST(Raft, ModularitySolvers) { auto stream = h.get_stream(); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..8d5cd68f13 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -77,12 +77,11 @@ class DistanceAdjTest int n = params.n; int k = params.k; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); @@ -94,7 +93,7 @@ class DistanceAdjTest raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { @@ -103,21 +102,16 @@ class DistanceAdjTest raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - 
CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - } + void TearDown() override {} protected: DistanceAdjInputs params; DataType *x, *y; bool *dist_ref, *dist; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 9e3290593d..4798d102f3 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -392,13 +392,12 @@ class DistanceTest : public ::testing::TestWithParam> { int k = params.k; DataType metric_arg = params.metric_arg; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); - raft::allocate(dist2, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); + raft::allocate(dist2, m * n, stream); if (distanceType == raft::distance::DistanceType::HellingerExpanded || distanceType == raft::distance::DistanceType::JensenShannon || distanceType == raft::distance::DistanceType::KLDivergence) { @@ -416,7 +415,6 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); char *workspace = nullptr; @@ -424,28 +422,24 @@ class DistanceTest : public ::testing::TestWithParam> { raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } DataType threshold = -10000.f; distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace, worksize, stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); } void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - CUDA_CHECK(cudaFree(dist2)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: DistanceInputs params; DataType *x, *y, *dist_ref, *dist, *dist2; + cudaStream_t stream; }; } // end namespace distance diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 4573a070b6..cfea4ee2d9 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -107,13 +107,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> { int n = params.n; int k = params.k; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(xn, m); - raft::allocate(yn, n); - raft::allocate(workspace, sizeof(int) * m); - raft::allocate(min, m); - raft::allocate(min_ref, m); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(xn, m, stream); + raft::allocate(yn, n, stream); + raft::allocate(workspace, sizeof(int) * m, stream); + raft::allocate(min, m, stream); + raft::allocate(min_ref, m, stream); r.uniform(x, m * k, DataT(-1.0), DataT(1.0), stream); r.uniform(y, n * k, DataT(-1.0), DataT(1.0), stream); generateGoldenResult(); @@ -122,15 +122,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - 
CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(xn)); - CUDA_CHECK(cudaFree(yn)); - CUDA_CHECK(cudaFree(workspace)); - CUDA_CHECK(cudaFree(min_ref)); - CUDA_CHECK(cudaFree(min)); } protected: @@ -282,18 +275,17 @@ class FusedL2NNDetTest : public FusedL2NNTest { void SetUp() override { FusedL2NNTest::SetUp(); int m = this->params.m; - raft::allocate(min1, m); + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(min1, m, stream); } - void TearDown() override { - FusedL2NNTest::TearDown(); - CUDA_CHECK(cudaFree(min1)); - } + void TearDown() override { FusedL2NNTest::TearDown(); } protected: cub::KeyValuePair *min1; static const int NumRepeats = 100; + cudaStream_t stream; void generateGoldenResult() override {} }; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 328137f42d..15794ef568 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -100,18 +100,15 @@ TEST(Raft, SpectralSolvers) { seed}; kmeans_solver_t cluster_solver{clust_cfg}; - auto stream = h.get_stream(); - - auto t_exe_p = thrust::cuda::par.on(stream); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, + clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..b28c754a5a 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -20,7 +20,6 @@ #include #include -#include #include "../test_utils.h" #include @@ -44,9 +43,9 @@ TEST_F(MakeMonotonicTest, Result) { float *data, *actual, *expected; - raft::allocate(data, m, true); - raft::allocate(actual, m, true); - raft::allocate(expected, m, true); + raft::allocate(data, m, stream, true); + raft::allocate(actual, m, stream, true); + raft::allocate(expected, m, stream, true); float *data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; @@ -57,17 +56,14 @@ TEST_F(MakeMonotonicTest, Result) { raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); - make_monotonic(actual, data, m, stream, allocator); + make_monotonic(actual, data, m, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT_TRUE(devArrMatch(actual, expected, m, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(actual)); delete data_h; delete expected_h; @@ -76,39 +72,35 @@ TEST_F(MakeMonotonicTest, Result) { TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); int n_rows = 6; float *y_d; - raft::allocate(y_d, n_rows); + raft::allocate(y_d, n_rows, stream); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); - int n_classes; - float *y_unique_d; - getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); + rmm::device_uvector y_unique_d(0, stream); + int n_classes = getUniquelabels(y_unique_d, y_d, n_rows, stream); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - 
EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); float *y_relabeled_d; - raft::allocate(y_relabeled_d, n_rows); + raft::allocate(y_relabeled_d, n_rows, stream); - getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); + getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2, + stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(y_d)); - CUDA_CHECK(cudaFree(y_unique_d)); - CUDA_CHECK(cudaFree(y_relabeled_d)); } }; // namespace label }; // namespace raft diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..28d8d59884 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" @@ -50,7 +50,7 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} + m(stream) {} void Run() { raft::update_device(labels_a.data(), params.labels_a.data(), params.N, @@ -76,7 +76,7 @@ class MergeLabelsTest raft::handle_t handle; cudaStream_t stream; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_uvector mask, m; + rmm::device_scalar mask, m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 04f473f836..08429e18f2 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -24,6 +24,8 @@ */ #include +#include + #include #include #include @@ -65,15 +67,12 @@ void hungarian_test(int problemsize, int costrange, int problemcount, for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize * problemsize); - raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); - raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + rmm::device_uvector elements_v( + batchsize * problemsize * problemsize, handle.get_stream()); + rmm::device_uvector row_assignment_v(batchsize * problemsize, + handle.get_stream()); + rmm::device_uvector col_assignment_v(batchsize * problemsize, + handle.get_stream()); raft::update_device(elements_v.data(), h_cost, batchsize * problemsize * problemsize, diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 2fc9d4e30f..301f069a33 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -32,10 +32,10 @@ class AddTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InT(-1.0), InT(1.0), stream); r.uniform(in2, len, InT(-1.0), InT(1.0), stream); naiveAddElem(out_ref, in1, in2, len); @@ -43,11 +43,7 @@ class AddTest : public ::testing::TestWithParam> { } void TearDown() override { - 
CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..475d8e58ff 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include "../test_utils.h" #include "binary_op.cuh" @@ -48,10 +48,10 @@ class BinaryOpTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); naiveAdd(out_ref, in1, in2, len); @@ -136,9 +136,9 @@ class BinaryOpAlignment : public ::testing::Test { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; - mr::device::buffer x(handle.get_device_allocator(), stream, n); - mr::device::buffer y(handle.get_device_allocator(), stream, n); - mr::device::buffer z(handle.get_device_allocator(), stream, n); + rmm::device_uvector x(n, stream); + rmm::device_uvector y(n, stream); + rmm::device_uvector z(n, stream); CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 00236d53fa..00db1715dc 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -19,8 +19,9 @@ #include #include #include -#include -#include +#include +#include + #include #include #include "../test_utils.h" @@ -31,12 +32,11 @@ template class CholeskyR1Test : public ::testing::Test { protected: CholeskyR1Test() - : allocator(handle.get_device_allocator()), - G(allocator, handle.get_stream(), n_rows * n_rows), - L(allocator, handle.get_stream(), n_rows * n_rows), - L_exp(allocator, handle.get_stream(), n_rows * n_rows), - devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) { + : G(n_rows * n_rows, handle.get_stream()), + L(n_rows * n_rows, handle.get_stream()), + L_exp(n_rows * n_rows, handle.get_stream()), + devInfo(handle.get_stream()), + workspace(0, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -105,7 +105,6 @@ class CholeskyR1Test : public ::testing::Test { } raft::handle_t handle; - std::shared_ptr allocator; cusolverDnHandle_t solver_handle; cudaStream_t stream; @@ -120,11 +119,11 @@ class CholeskyR1Test : public ::testing::Test { math_t G2_host[4] = {3, 4, 2, 1}; - raft::mr::device::buffer devInfo; - raft::mr::device::buffer G; - raft::mr::device::buffer L_exp; - raft::mr::device::buffer L; - raft::mr::device::buffer workspace; + rmm::device_scalar devInfo; + rmm::device_uvector G; + rmm::device_uvector L_exp; + rmm::device_uvector L; + rmm::device_uvector workspace; }; typedef ::testing::Types FloatTypes; diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index e45f5651b4..45dbd9dcc4 100644 --- 
a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -57,11 +57,10 @@ class coalescedReductionTest raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveCoalescedReduction(dots_exp, data, cols, rows, stream); @@ -70,18 +69,18 @@ class coalescedReductionTest // Add to result with inplace = true next coalescedReductionLaunch(dots_act, data, cols, rows, stream, true); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: coalescedReductionInputs params; T *data, *dots_exp, *dots_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 2396558939..563f96c835 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -51,27 +51,26 @@ class DivideTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveDivide(out_ref, in, params.scalar, len, stream); divideScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 159d288174..6e26757cf3 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -50,24 +50,24 @@ class EigTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, len); - raft::allocate(eig_vals, params.n_col); - raft::allocate(eig_vectors_jacobi, len); - raft::allocate(eig_vals_jacobi, params.n_col); + raft::allocate(eig_vectors, len, stream); + raft::allocate(eig_vals, params.n_col, stream); + raft::allocate(eig_vectors_jacobi, len, stream); + raft::allocate(eig_vals_jacobi, params.n_col, stream); T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; - raft::allocate(eig_vectors_ref, len); - raft::allocate(eig_vals_ref, 
params.n_col); + raft::allocate(eig_vectors_ref, len, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); @@ -82,11 +82,11 @@ class EigTest : public ::testing::TestWithParam> { // test code for comparing two methods len = params.n * params.n; - raft::allocate(cov_matrix_large, len); - raft::allocate(eig_vectors_large, len); - raft::allocate(eig_vectors_jacobi_large, len); - raft::allocate(eig_vals_large, params.n); - raft::allocate(eig_vals_jacobi_large, params.n); + raft::allocate(cov_matrix_large, len, stream); + raft::allocate(eig_vectors_large, len, stream); + raft::allocate(eig_vectors_jacobi_large, len, stream); + raft::allocate(eig_vals_large, params.n, stream); + raft::allocate(eig_vals_jacobi_large, params.n, stream); r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); @@ -97,15 +97,7 @@ class EigTest : public ::testing::TestWithParam> { sweeps); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vectors_jacobi)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vals_jacobi)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigInputs params; diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3980f281d..bdd0a08ff6 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -51,36 +51,31 @@ class EigSelTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, 12); - raft::allocate(eig_vals, params.n_col); + raft::allocate(eig_vectors, 12, stream); + raft::allocate(eig_vals, params.n_col, stream); T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; - raft::allocate(eig_vectors_ref, 12); - raft::allocate(eig_vals_ref, params.n_col); + raft::allocate(eig_vectors_ref, 12, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigSelInputs params; diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 572951c557..e955f7a354 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -69,9 +69,9 @@ class ScalarMultiplyTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + 
raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); scalarMultiply(out, in, scalar, len, stream); @@ -156,10 +156,10 @@ class EltwiseAddTest : public ::testing::TestWithParam> { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); int len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); naiveAdd(out_ref, in1, in2, len, stream); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..5b13fb5362 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -48,11 +47,10 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, const InType *in3, InType scalar, IdxType len, cudaStream_t stream) { InType *tmp; - allocate(tmp, len); + raft::allocate(tmp, len, stream); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream); - CUDA_CHECK(cudaFree(tmp)); } template @@ -64,35 +62,32 @@ class MapTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(in3, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in3, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); r.uniform(in3, len, InType(-1.0), InType(1.0), stream); create_ref(out_ref, in1, in2, in3, params.scalar, len, stream); mapLaunch(out, in1, in2, in3, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(in3)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapInputs params; InType *in1, *in2, *in3; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 6e146fa4bb..4a44e59504 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "../test_utils.h" namespace raft { @@ -74,26 +76,25 @@ class MapReduceTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); auto len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); mapReduceLaunch(out_ref, out, in, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapReduceInputs params; InType *in; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { @@ -131,9 +132,7 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) { + : input(n, handle.get_stream()), output(handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -172,9 +171,8 @@ class MapGenericReduceTest : public ::testing::Test { int n = 1237; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; + rmm::device_uvector input; + rmm::device_scalar output; }; using IoTypePair = diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aa46c78b0f..e017ee0918 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -66,14 +66,13 @@ class MatVecOpTest IdxType N = params.rows, D = params.cols; IdxType len = N * D; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); IdxType vecLen = params.bcastAlongRows ? D : N; - allocate(vec1, vecLen); - allocate(vec2, vecLen); + raft::allocate(vec1, vecLen, stream); + raft::allocate(vec2, vecLen, stream); r.uniform(in, len, (T)-1.0, (T)1.0, stream); r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); @@ -86,20 +85,18 @@ class MatVecOpTest } matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, params.useTwoVectors, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(vec1)); - CUDA_CHECK(cudaFree(vec2)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(in)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatVecOpInputs params; T *in, *out, *out_ref, *vec1, *vec2; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 1d3e753de3..d7bda7c27d 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -31,27 +31,26 @@ class MultiplyTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, params.scalar, len, stream); multiplyScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - 
CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index acc25addd0..5563064982 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -78,9 +78,9 @@ class RowNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -143,10 +143,10 @@ class ColNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); + raft::allocate(data, len, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); - raft::allocate(dots_exp, cols); - raft::allocate(dots_act, cols); + raft::allocate(dots_exp, cols, stream); + raft::allocate(dots_act, cols, stream); naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -157,13 +157,11 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9082397265..7ceeaf7f8e 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -63,9 +63,9 @@ class ReduceTest int rows = params.rows, cols = params.cols; int len = rows * cols; outlen = params.alongRows ? 
rows : cols; - raft::allocate(data, len); - raft::allocate(dots_exp, outlen); - raft::allocate(dots_act, outlen); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, outlen, stream); + raft::allocate(dots_act, outlen, stream); r.uniform(data, len, InType(-1.0), InType(1.0), stream); naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); @@ -82,9 +82,7 @@ class ReduceTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 30a9c2e271..7f8319636b 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include namespace raft { namespace linalg { @@ -54,17 +54,20 @@ void unaryAndGemv(OutType *dots, const InType *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + rmm::device_uvector ones(N, stream); //column vector [1...1] + raft::linalg::unaryOp( + ones.data(), ones.data(), ones.size(), + [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, + sq.data(), D, ones.data(), 1, &beta, + dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index b27fa2ac1a..55d8cc0e92 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -49,9 +49,9 @@ class stridedReductionTest int rows = params.rows, cols = params.cols; int len = rows * cols; - raft::allocate(data, len); - raft::allocate(dots_exp, cols); //expected dot products (from test) - raft::allocate(dots_act, cols); //actual dot products (from prim) + raft::allocate(data, len, stream); + raft::allocate(dots_exp, cols, stream); //expected dot products (from test) + raft::allocate(dots_act, cols, stream); //actual dot products (from prim) r.uniform(data, len, T(-1.0), T(1.0), stream); //initialize matrix to random @@ -60,9 +60,7 @@ class stridedReductionTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index ced3f65fdd..27dea8503f 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -79,12 +79,11 @@ class SubtractTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, 
len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); @@ -95,19 +94,18 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out, out, T(1), len, stream); subtract(in1, in1, in2, len, stream); subtractScalar(in1, in1, T(1), len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SubtractInputs params; T *in1, *in2, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index fff321768f..72a27790de 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -48,8 +48,8 @@ class SvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream = handle.get_stream(); - raft::allocate(data, len); + stream = handle.get_stream(); + raft::allocate(data, len, stream); ASSERT(params.n_row == 3, "This test only supports nrows=3!"); ASSERT(params.len == 6, "This test only supports len=6!"); @@ -59,9 +59,9 @@ class SvdTest : public ::testing::TestWithParam> { int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; - raft::allocate(left_eig_vectors_qr, left_evl); - raft::allocate(right_eig_vectors_trans_qr, right_evl); - raft::allocate(sing_vals_qr, params.n_col); + raft::allocate(left_eig_vectors_qr, left_evl, stream); + raft::allocate(right_eig_vectors_trans_qr, right_evl, stream); + raft::allocate(sing_vals_qr, params.n_col, stream); // allocate(left_eig_vectors_jacobi, left_evl); // allocate(right_eig_vectors_trans_jacobi, right_evl); @@ -74,9 +74,9 @@ class SvdTest : public ::testing::TestWithParam> { T sing_vals_ref_h[] = {7.065283, 1.040081}; - raft::allocate(left_eig_vectors_ref, left_evl); - raft::allocate(right_eig_vectors_ref, right_evl); - raft::allocate(sing_vals_ref, params.n_col); + raft::allocate(left_eig_vectors_ref, left_evl, stream); + raft::allocate(right_eig_vectors_ref, right_evl, stream); + raft::allocate(sing_vals_ref, params.n_col, stream); raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); @@ -87,22 +87,16 @@ class SvdTest : public ::testing::TestWithParam> { svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(left_eig_vectors_qr)); - CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); - CUDA_CHECK(cudaFree(sing_vals_qr)); - CUDA_CHECK(cudaFree(left_eig_vectors_ref)); - CUDA_CHECK(cudaFree(right_eig_vectors_ref)); - CUDA_CHECK(cudaFree(sing_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: SvdInputs params; T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git 
a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f10b029962..c574f54a05 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -48,26 +48,22 @@ class TransposeTest : public ::testing::TestWithParam> { int len = params.len; - raft::allocate(data, len); + raft::allocate(data, len, stream); ASSERT(params.len == 9, "This test works only with len=9!"); T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; raft::update_device(data, data_h, len, stream); - raft::allocate(data_trans_ref, len); + raft::allocate(data_trans_ref, len, stream); T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; raft::update_device(data_trans_ref, data_ref_h, len, stream); - raft::allocate(data_trans, len); + raft::allocate(data_trans, len, stream); transpose(handle, data, data_trans, params.n_row, params.n_col, stream); transpose(data, params.n_row, stream); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(data_trans)); - CUDA_CHECK(cudaFree(data_trans_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: TranposeInputs params; diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 666ab8619d..042e8b9cbf 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -53,18 +53,15 @@ class UnaryOpTest raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); } virtual void DoTest() { diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 578139623a..63381dec07 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -115,22 +115,22 @@ class MathTest : public ::testing::TestWithParam> { random::Rng r(params.seed); int len = params.len; - allocate(in_power, len); - allocate(out_power_ref, len); - allocate(in_sqrt, len); - allocate(out_sqrt_ref, len); - allocate(in_sign_flip, len); - allocate(out_sign_flip_ref, len); - raft::handle_t handle; - cudaStream_t stream; + stream = handle.get_stream(); CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in_ratio, 4); + raft::allocate(in_power, len, stream); + raft::allocate(out_power_ref, len, stream); + raft::allocate(in_sqrt, len, stream); + raft::allocate(out_sqrt_ref, len, stream); + raft::allocate(in_sign_flip, len, stream); + raft::allocate(out_sign_flip_ref, len, stream); + + raft::allocate(in_ratio, 4, stream); T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; update_device(in_ratio, in_ratio_h, 4, stream); - allocate(out_ratio_ref, 4); + raft::allocate(out_ratio_ref, 4, stream); T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375}; update_device(out_ratio_ref, out_ratio_ref_h, 4, stream); @@ -150,9 +150,9 @@ class MathTest : public ::testing::TestWithParam> { naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col); signFlip(in_sign_flip, params.n_row, params.n_col, stream); - allocate(in_recip, 4); - allocate(in_recip_ref, 4); - allocate(out_recip, 4); + raft::allocate(in_recip, 4, stream); + raft::allocate(in_recip_ref, 4, stream); + raft::allocate(out_recip, 4, stream); // default 
threshold is 1e-15 std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; @@ -167,38 +167,23 @@ class MathTest : public ::testing::TestWithParam> { std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; - allocate(in_smallzero, 4); - allocate(out_smallzero, 4); - allocate(out_smallzero_ref, 4); + raft::allocate(in_smallzero, 4, stream); + raft::allocate(out_smallzero, 4, stream); + raft::allocate(out_smallzero_ref, 4, stream); update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream); update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); setSmallValuesZero(in_smallzero, 4, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(in_power)); - CUDA_CHECK(cudaFree(out_power_ref)); - CUDA_CHECK(cudaFree(in_sqrt)); - CUDA_CHECK(cudaFree(out_sqrt_ref)); - CUDA_CHECK(cudaFree(in_ratio)); - CUDA_CHECK(cudaFree(out_ratio_ref)); - CUDA_CHECK(cudaFree(in_sign_flip)); - CUDA_CHECK(cudaFree(out_sign_flip_ref)); - CUDA_CHECK(cudaFree(in_recip)); - CUDA_CHECK(cudaFree(in_recip_ref)); - CUDA_CHECK(cudaFree(out_recip)); - CUDA_CHECK(cudaFree(in_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: MathInputs params; T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 28222c0697..cc88df0a73 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include "../test_utils.h" namespace raft { @@ -43,11 +44,10 @@ class MatrixTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(in1_revr, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in1_revr, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); copy(in1, in2, params.n_row, params.n_col, stream); @@ -55,20 +55,20 @@ class MatrixTest : public ::testing::TestWithParam> { // colReverse(in1_revr, params.n_row, params.n_col); T *outTrunc; - raft::allocate(outTrunc, 6); + raft::allocate(outTrunc, 6, stream); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - // CUDA_CHECK(cudaFree(in1_revr)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatrixInputs params; T *in1, *in2, *in1_revr; + cudaStream_t stream; }; const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; @@ -102,17 +102,16 @@ class MatrixCopyRowsTest : public ::testing::Test { protected: MatrixCopyRowsTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n_cols * n_rows), - indices(allocator, handle.get_stream(), n_selected), - 
output(allocator, handle.get_stream(), n_cols * n_selected) { + : input(n_cols * n_rows, handle.get_stream()), + indices(n_selected, handle.get_stream()), + output(n_cols * n_selected, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } @@ -143,10 +142,9 @@ class MatrixCopyRowsTest : public ::testing::Test { 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; - raft::mr::device::buffer indices; + rmm::device_uvector input; + rmm::device_uvector output; + rmm::device_uvector indices; }; using TypeTuple = diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..fe42cea8b3 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -15,22 +15,21 @@ */ #include +#include #include #include -#include +#include #include -#include namespace raft { namespace mr { namespace device { TEST(Raft, DeviceBufferAlloc) { - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream); + rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); // explicit allocation after construction buff.resize(20, stream); @@ -39,12 +38,12 @@ TEST(Raft, DeviceBufferAlloc) { buff.resize(10, stream); ASSERT_EQ(10, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); - // use these methods without the explicit stream parameter + // resize and release again, passing the stream explicitly - buff.resize(20); + buff.resize(20, stream); ASSERT_EQ(20, buff.size()); - buff.resize(10); + buff.resize(10, stream); ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); @@ -62,11 +61,10 @@ TEST(Raft, DeviceBufferZeroResize) { rmm::mr::set_current_device_resource(limit_mr.get()); - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - // no allocation at construction + // eager allocation of 10 elements at construction - buffer buff(alloc, stream, 10); + rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); - // explicit allocation after construction + // shrink to zero after construction buff.resize(0, stream); @@ -75,7 +73,7 @@ buff.resize(20, stream); ASSERT_EQ(20, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // Now check that there is no memory left. (Used to not be true) diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 94f81cddb8..781e6d1d3f 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -14,6 +14,12 @@ * limitations under the License.
*/ +#include + +#include +#include +#include +#include #include "test_utils.h" #include @@ -128,11 +134,18 @@ class MSTTest v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector color(v, 0); + rmm::device_uvector mst_src(2 * v - 2, handle.get_stream()); + rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); + rmm::device_uvector color(v, handle.get_stream()); + + CUDA_CHECK( + cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + handle.get_stream())); vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); @@ -215,7 +228,6 @@ class MSTTest protected: MSTTestInput mst_input; CSRDevice csr_d; - rmm::device_vector mst_edge; vertex_t v; edge_t e; int iterations; diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index af10dcab30..c2ec7a340f 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -85,11 +85,10 @@ class RngTest : public ::testing::TestWithParam> { // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; params = ::testing::TestWithParam>::GetParam(); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); @@ -124,12 +123,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -182,6 +181,7 @@ class RngTest : public ::testing::TestWithParam> { T *data, *stats; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; // The measured mean and standard deviation for each tested distribution are, @@ -383,9 +383,9 @@ TEST(Rng, MeanError) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_result, num_experiments); - allocate(std_result, num_experiments); + raft::allocate(data, len, stream); + raft::allocate(mean_result, num_experiments, stream); + raft::allocate(std_result, num_experiments, stream); for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) { Rng r(seed, rtype); @@ -416,10 +416,8 @@ TEST(Rng, MeanError) { ASSERT_TRUE( (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_result)); - CUDA_CHECK(cudaFree(std_result)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -432,7 +430,7 @@ class ScaledBernoulliTest : public ::testing::Test { Rng r(42); - allocate(data, len * 
sizeof(T), stream); + raft::allocate(data, len * sizeof(T), stream); r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); } @@ -463,7 +461,7 @@ class BernoulliTest : public ::testing::Test { void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); - allocate(data, len * sizeof(bool), stream); + raft::allocate(data, len * sizeof(bool), stream); r.bernoulli(data, len, T(0.5), stream); } @@ -515,12 +513,11 @@ class RngNormalTableTest params = ::testing::TestWithParam>::GetParam(); int len = params.rows * params.cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, len); - allocate(stats, 2, true); - allocate(mu_vec, params.cols); + raft::allocate(data, len, stream); + raft::allocate(stats, 2, stream, true); + raft::allocate(mu_vec, params.cols, stream); r.fill(mu_vec, params.cols, params.mu, stream); T* sigma_vec = nullptr; r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, @@ -532,13 +529,12 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); - CUDA_CHECK(cudaFree(mu_vec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -551,6 +547,7 @@ class RngNormalTableTest T *data, *stats, *mu_vec; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; typedef RngNormalTableTest RngNormalTableTestF; diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index 92f12206e8..a98619e5b4 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -70,10 +70,9 @@ class RngTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); @@ -87,12 +86,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(float meanvar[2]) { @@ -110,6 +109,7 @@ class RngTest : public ::testing::TestWithParam> { T *data; float *stats; float h_stats[2]; // mean, var + cudaStream_t stream; }; typedef RngTest RngTestU32; diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index d7e52a8958..cf60f46afe 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -50,10 +50,10 @@ class SWoRTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(in, params.len); - allocate(wts, params.len); - allocate(out, params.sampledLen); - allocate(outIdx, params.sampledLen); + raft::allocate(in, params.len, stream); + 
raft::allocate(wts, params.len, stream); + raft::allocate(out, params.sampledLen, stream); + raft::allocate(outIdx, params.sampledLen, stream); h_outIdx.resize(params.sampledLen); r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); @@ -67,12 +67,8 @@ class SWoRTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(wts)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(outIdx)); } protected: diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..8429a46941 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -56,27 +56,24 @@ class CSRAddTest cudaStreamCreate(&stream); - raft::allocate(ind_a, n_rows); - raft::allocate(ind_ptr_a, nnz_a); - raft::allocate(values_a, nnz_a); + raft::allocate(ind_a, n_rows, stream); + raft::allocate(ind_ptr_a, nnz_a, stream); + raft::allocate(values_a, nnz_a, stream); - raft::allocate(ind_b, n_rows); - raft::allocate(ind_ptr_b, nnz_b); - raft::allocate(values_b, nnz_b); + raft::allocate(ind_b, n_rows, stream); + raft::allocate(ind_ptr_b, nnz_b, stream); + raft::allocate(values_b, nnz_b, stream); - raft::allocate(ind_verify, n_rows); - raft::allocate(ind_ptr_verify, nnz_result); - raft::allocate(values_verify, nnz_result); + raft::allocate(ind_verify, n_rows, stream); + raft::allocate(ind_ptr_verify, nnz_result, stream); + raft::allocate(values_verify, nnz_result, stream); - raft::allocate(ind_result, n_rows); - raft::allocate(ind_ptr_result, nnz_result); - raft::allocate(values_result, nnz_result); + raft::allocate(ind_result, n_rows, stream); + raft::allocate(ind_ptr_result, nnz_result, stream); + raft::allocate(values_result, nnz_result, stream); } void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); @@ -96,7 +93,7 @@ class CSRAddTest Index_ nnz = linalg::csr_add_calc_inds( ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + n_rows, ind_result, stream); ASSERT_TRUE(nnz == nnz_result); ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, @@ -113,18 +110,8 @@ class CSRAddTest } void TearDown() override { - CUDA_CHECK(cudaFree(ind_a)); - CUDA_CHECK(cudaFree(ind_b)); - CUDA_CHECK(cudaFree(ind_result)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(ind_ptr_verify)); - CUDA_CHECK(cudaFree(ind_ptr_result)); - CUDA_CHECK(cudaFree(values_a)); - CUDA_CHECK(cudaFree(values_b)); - CUDA_CHECK(cudaFree(values_verify)); - CUDA_CHECK(cudaFree(values_result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..dd6ba1479e 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -57,14 +56,12 @@ class ConnectComponentsTest : public ::testing::TestWithParam< void basicTest() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); params = ::testing::TestWithParam< 
ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_stream()); rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); @@ -77,7 +74,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< /** * 1. Construct knn graph */ - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph( handle, data.data(), params.n_row, params.n_col, @@ -85,7 +82,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + params.n_row + 1, stream); /** * 2. Construct MST, sorted by weights @@ -112,7 +109,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, - d_alloc, stream); + stream); auto output_mst = raft::mst::mst( handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index ea69ecfc53..4f9c00c7ab 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -43,9 +43,9 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -62,9 +62,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..465aad4e7f 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -19,7 +19,6 @@ #include #include "../test_utils.h" -#include #include #include @@ -61,8 +60,6 @@ typedef SparseConvertCSRTest SortedCOOToCSR; TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); int nnz = 8; @@ -71,14 +68,14 @@ TEST_P(SortedCOOToCSR, Result) { int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; int *exp_h = new int[4]{0, 2, 4, 6}; - raft::allocate(in, nnz, true); - raft::allocate(exp, 4, true); - raft::allocate(out, 4, true); + raft::allocate(in, nnz, stream, true); + raft::allocate(exp, 4, stream, true); + raft::allocate(out, 4, stream, true); raft::update_device(in, in_h, nnz, stream); raft::update_device(exp, exp_h, 4, stream); - convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + convert::sorted_coo_to_csr(in, nnz, out, 4, stream); ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); @@ -115,10 +112,10 @@ class CSRAdjGraphTest cudaStreamCreate(&stream); nnz = params.verify.size(); - raft::allocate(row_ind, params.n_rows); - raft::allocate(adj, params.n_rows * params.n_cols); - raft::allocate(result, nnz, true); - raft::allocate(verify, nnz); + raft::allocate(row_ind, params.n_rows, stream); + 
raft::allocate(adj, params.n_rows * params.n_cols, stream); + raft::allocate(result, nnz, stream, true); + raft::allocate(verify, nnz, stream); } void Run() { @@ -135,11 +132,8 @@ class CSRAdjGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..00e6899cb2 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -19,8 +19,6 @@ #include #include -#include -#include #include @@ -61,9 +59,9 @@ class CSRRowSliceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -73,9 +71,9 @@ class CSRRowSliceTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -84,16 +82,14 @@ class CSRRowSliceTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -113,16 +109,8 @@ class CSRRowSliceTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..7f6b7dad07 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -57,9 +55,9 @@ class CSRToDenseTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, 
indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -67,18 +65,16 @@ class CSRToDenseTest std::vector out_ref_h = params.out_ref_h; - allocate(out_ref, out_ref_h.size()); + raft::allocate(out_ref, out_ref_h.size(), stream); update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream); - allocate(out, out_ref_h.size()); + raft::allocate(out, out_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -92,12 +88,8 @@ class CSRToDenseTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..e50a9d94a9 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -20,8 +20,6 @@ #include #include -#include -#include #include @@ -63,9 +61,9 @@ class CSRTransposeTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -75,9 +73,9 @@ class CSRTransposeTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -86,16 +84,14 @@ class CSRTransposeTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -103,23 +99,15 @@ class CSRTransposeTest raft::sparse::linalg::csr_transpose( handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + params.nrows, params.ncols, params.nnz, stream); 
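// Editorial sketch (not part of the patch): the hunks in this section all migrate
// to the same lifecycle. raft::allocate() now draws from the RMM device allocator
// and records the pointer for its stream, so a single raft::deallocate_all(stream)
// in TearDown releases everything allocated on that stream. A minimal sketch of
// that pattern, assuming only the signatures visible in this diff —
// raft::allocate(ptr, n, stream, zero_init = false) and raft::deallocate_all(stream)
// from <raft/cudart_utils.h>; the function name below is illustrative only.
#include <raft/cudart_utils.h>

void allocation_lifecycle_sketch() {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  float* data = nullptr;
  int* counts = nullptr;
  raft::allocate(data, 1024, stream);        // uninitialized device buffer
  raft::allocate(counts, 32, stream, true);  // zero-initialized, like the `true` calls above

  // ... enqueue kernels on `stream` ...

  CUDA_CHECK(cudaStreamSynchronize(stream));
  raft::deallocate_all(stream);              // frees every pointer recorded for this stream
  CUDA_CHECK(cudaStreamDestroy(stream));
}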
CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 5d687ad92b..f8a469af45 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -48,25 +48,27 @@ const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; TEST_P(COODegree, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; int verify_h[5] = {2, 1, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); - linalg::coo_degree<32>(in_rows, 5, results, 0); + linalg::coo_degree<32>(in_rows, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; @@ -81,23 +83,21 @@ TEST_P(COODegreeNonzero, Result) { float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; int verify_h[5] = {1, 0, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); - raft::allocate(in_vals, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); + raft::allocate(in_vals, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); - raft::update_device(in_vals, *&in_vals_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); + raft::update_device(in_vals, *&in_vals_h, 5, stream); linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index a83b93f83f..563dcf6f15 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -94,10 +94,9 @@ class SparseDistanceCOOSPMVTest template void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); + rmm::device_uvector coo_rows( + 
max(dist_config.b_nnz, dist_config.a_nnz), + dist_config.handle.get_stream()); raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, coo_rows.data(), dist_config.b_nnz, @@ -161,9 +160,9 @@ class SparseDistanceCOOSPMVTest std::vector indices_h = params.input_configuration.indices_h; std::vector data_h = params.input_configuration.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -174,7 +173,8 @@ class SparseDistanceCOOSPMVTest std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); @@ -201,21 +201,14 @@ class SparseDistanceCOOSPMVTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); run_spmv(); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 0589637061..4b531992f0 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include @@ -82,21 +81,14 @@ class SparseDistanceTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, @@ -110,9 +102,9 @@ class SparseDistanceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -122,7 +114,8 @@ class SparseDistanceTest std::vector out_dists_ref_h = params.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + 
handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), dist_config.handle.get_stream()); diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..4634e5fc0e 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include #include @@ -53,13 +52,11 @@ typedef SparseFilterTests COORemoveZeros; TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); float *in_h_vals = new float[params.nnz]; - COO in(alloc, stream, params.nnz, 5, 5); + COO in(stream, params.nnz, 5, 5); raft::random::Rng r(params.seed); r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); @@ -82,7 +79,7 @@ TEST_P(COORemoveZeros, Result) { raft::update_device(in.cols(), in_h_cols, params.nnz, stream); raft::update_device(in.vals(), in_h_vals, params.nnz, stream); - op::coo_sort(&in, alloc, stream); + op::coo_sort(&in, stream); int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; @@ -91,14 +88,14 @@ TEST_P(COORemoveZeros, Result) { out_vals_ref_h[0] = in_h_vals[4]; out_vals_ref_h[1] = in_h_vals[1]; - COO out_ref(alloc, stream, 2, 5, 5); - COO out(alloc, stream); + COO out_ref(stream, 2, 5, 5); + COO out(stream); raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); - op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); + op::coo_remove_zeros<32, float>(&in, &out, stream); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8c3bf36318..22f97559b1 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -24,8 +24,6 @@ #include #include -#include -#include namespace raft { namespace sparse { @@ -82,15 +80,7 @@ class SparseKNNTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, @@ -105,9 +95,9 @@ class SparseKNNTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -118,16 +108,17 @@ class SparseKNNTest std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_dists_ref, out_dists_ref_h.size()); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), + handle.get_stream()); + raft::allocate(out_dists_ref, out_dists_ref_h.size(), handle.get_stream()); update_device(out_indices_ref, out_indices_ref_h.data(), 
out_indices_ref_h.size(), handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); - allocate(out_dists, n_rows * k); - allocate(out_indices, n_rows * k); + raft::allocate(out_dists, n_rows * k, handle.get_stream()); + raft::allocate(out_indices, n_rows * k, handle.get_stream()); } raft::handle_t handle; diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..e259eafa70 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -64,12 +65,11 @@ class KNNGraphTest raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); - out = new raft::sparse::COO(alloc, stream); + out = new raft::sparse::COO(stream); - allocate(X, params.X.size()); + raft::allocate(X, params.X.size(), stream); update_device(X, params.X.data(), params.X.size(), stream); @@ -77,9 +77,8 @@ class KNNGraphTest handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); /** * Assert the knn graph is symmetric @@ -87,12 +86,13 @@ class KNNGraphTest assert_symmetry<<nnz, 256), 256, 0, stream>>>( out->rows(), out->cols(), out->vals(), out->nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(X)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index a157a17e30..3bd144ef54 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include @@ -108,18 +106,16 @@ __global__ void computeTheNumerator(const T* firstClusterArray, * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr * @param stream: the cudaStream object */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { +double compute_rand_index(T* firstClusterArray, T* secondClusterArray, + uint64_t size, cudaStream_t stream) { //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); //allocating and initializing memory for a and b in the GPU - raft::mr::device::buffer arr_buf(allocator, stream, 2); + rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); //kernel configuration @@ -159,30 +155,27 @@ template class LinkageTest : public ::testing::TestWithParam> { protected: void basicTest() { - raft::handle_t handle; + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, stream); // Allocate result labels and expected labels on device - raft::allocate(labels, params.n_row); - 
raft::allocate(labels_ref, params.n_row); + raft::allocate(labels, params.n_row, stream); + raft::allocate(labels_ref, params.n_row, stream); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), stream); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, stream); out_arrs.children = out_children.data(); + raft::handle_t handle; raft::hierarchy::single_linkage< IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( handle, data.data(), params.n_row, params.n_col, @@ -191,23 +184,21 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + score = compute_rand_index(labels, labels_ref, params.n_row, stream); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(labels_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: LinkageInputs params; IdxT *labels, *labels_ref; - double score; + cudaStream_t stream; }; const std::vector> linkage_inputsf2 = { diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 7adbbf8b9a..d69dd15c57 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -47,10 +47,10 @@ class CSRRowNormalizeTest CSRRowNormalizeInputs>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(in_vals, params.in_vals.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(in_vals, params.in_vals.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -77,11 +77,8 @@ class CSRRowNormalizeTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..8ff4a600bc 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -53,7 +53,6 @@ class SparseReduceTest void Run() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); @@ -76,7 +75,7 @@ class SparseReduceTest raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); - raft::sparse::COO out(d_alloc, stream); + raft::sparse::COO out(stream); raft::sparse::op::max_duplicates(handle, out, in_rows.data(), in_cols.data(), in_vals.data(), params.in_rows.size(), params.m, params.n); diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index b64fa25883..805a3d85da 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -59,9 +59,9 @@ class CSRRowOpTest n_rows = 
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index b64fa25883..805a3d85da 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -59,9 +59,9 @@ class CSRRowOpTest
     n_rows = params.ex_scan.size();
     nnz = params.verify.size();
 
-    raft::allocate(verify, nnz);
-    raft::allocate(ex_scan, n_rows);
-    raft::allocate(result, nnz, true);
+    raft::allocate(verify, nnz, stream);
+    raft::allocate(ex_scan, n_rows, stream);
+    raft::allocate(result, nnz, stream, true);
   }
 
   void Run() {
@@ -75,10 +75,8 @@ class CSRRowOpTest
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(ex_scan));
-    CUDA_CHECK(cudaFree(verify));
-    CUDA_CHECK(cudaFree(result));
-    cudaStreamDestroy(stream);
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
  protected:
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
index 46f2f6a844..256ecfdfb7 100644
--- a/cpp/test/sparse/selection.cu
+++ b/cpp/test/sparse/selection.cu
@@ -57,32 +57,30 @@ class SparseSelectionTest
   void make_data() {
     std::vector<value_t> dists_h = params.dists_h;
 
-    allocate(dists, n_rows * n_cols);
+    raft::allocate(dists, n_rows * n_cols, stream);
     update_device(dists, dists_h.data(), dists_h.size(), stream);
 
-    allocate(inds, n_rows * n_cols);
+    raft::allocate(inds, n_rows * n_cols, stream);
     iota_fill(inds, n_rows, n_cols, stream);
 
     std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
-    allocate(out_indices_ref, out_indices_ref_h.size());
-    allocate(out_dists_ref, out_dists_ref_h.size());
+    raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+    raft::allocate(out_dists_ref, out_dists_ref_h.size(), stream);
 
     update_device(out_indices_ref, out_indices_ref_h.data(),
                   out_indices_ref_h.size(), stream);
     update_device(out_dists_ref, out_dists_ref_h.data(),
                   out_dists_ref_h.size(), stream);
 
-    allocate(out_dists, n_rows * k);
-    allocate(out_indices, n_rows * k);
+    raft::allocate(out_dists, n_rows * k, stream);
+    raft::allocate(out_indices, n_rows * k, stream);
   }
 
   void SetUp() override {
     params = ::testing::TestWithParam<
       SparseSelectionInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     n_rows = params.n_rows;
@@ -99,15 +97,7 @@ class SparseSelectionTest
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaStreamSynchronize(stream));
-
-    CUDA_CHECK(cudaFree(dists));
-    CUDA_CHECK(cudaFree(inds));
-    CUDA_CHECK(cudaFree(out_indices));
-    CUDA_CHECK(cudaFree(out_dists));
-    CUDA_CHECK(cudaFree(out_indices_ref));
-    CUDA_CHECK(cudaFree(out_dists_ref));
-
+    raft::deallocate_all(stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index b9a8b849eb..e73a8a547b 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -20,7 +20,6 @@
 #include "../test_utils.h"
 
 #include
-#include
 #include
 
@@ -55,10 +54,8 @@ TEST_P(COOSort, Result) {
   raft::random::Rng r(params.seed);
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  std::shared_ptr<raft::mr::device::allocator> alloc(
-    new raft::mr::device::default_allocator);
 
-  raft::allocate(in_vals, params.nnz);
+  raft::allocate(in_vals, params.nnz, stream);
   r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
 
   int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
@@ -71,16 +68,16 @@ TEST_P(COOSort, Result) {
     in_cols_h[i] = i;
   }
 
-  raft::allocate(in_rows, params.nnz);
-  raft::allocate(in_cols, params.nnz);
-  raft::allocate(verify, params.nnz);
+  raft::allocate(in_rows, params.nnz, stream);
+  raft::allocate(in_cols, params.nnz, stream);
+  raft::allocate(verify, params.nnz, stream);
 
   raft::update_device(in_rows, in_rows_h, params.nnz, stream);
   raft::update_device(in_cols, in_cols_h, params.nnz, stream);
   raft::update_device(verify, verify_h, params.nnz, stream);
 
-  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc,
+  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals,
               stream);
 
   ASSERT_TRUE(
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index d104028d2b..35233dc473 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -17,11 +17,13 @@
 #include
 #include
 #include
-#include "../test_utils.h"
-
 #include
 #include
 #include
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include "../test_utils.h"
 
 #include
 
@@ -63,9 +65,9 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
     std::vector<value_idx> indices_h = params.indices_h;
     std::vector<value_t> data_h = params.data_h;
 
-    allocate(indptr, indptr_h.size());
-    allocate(indices, indices_h.size());
-    allocate(data, data_h.size());
+    raft::allocate(indptr, indptr_h.size(), stream);
+    raft::allocate(indices, indices_h.size(), stream);
+    raft::allocate(data, data_h.size(), stream);
 
     update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
     update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -78,7 +80,6 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
 
     raft::handle_t handle;
 
-    auto alloc = handle.get_device_allocator();
     stream = handle.get_stream();
 
     make_data();
@@ -87,23 +88,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
     value_idx n = params.n_cols;
     value_idx nnz = params.indices_h.size();
 
-    raft::mr::device::buffer<value_idx> coo_rows(alloc, stream, nnz);
+    rmm::device_uvector<value_idx> coo_rows(nnz, stream);
     raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream);
 
-    raft::sparse::COO<value_t, value_idx> out(alloc, stream);
+    raft::sparse::COO<value_t, value_idx> out(stream);
     raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m,
                                      n, coo_rows.size(), out);
 
-    raft::mr::device::buffer<value_idx> sum(alloc, stream, 1);
-
-    CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream));
+    rmm::device_scalar<value_idx> sum(stream);
+    sum.set_value_to_zero_async(stream);
 
     assert_symmetry<<<raft::ceildiv(out.nnz, 256), 256, 0, stream>>>(
       out.rows(), out.cols(), out.vals(), out.nnz, sum.data());
 
-    raft::update_host(&sum_h, sum.data(), 1, stream);
+    sum_h = sum.value(stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -148,9 +148,6 @@ TEST_P(COOSymmetrize, Result) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
-  std::shared_ptr<raft::mr::device::allocator> alloc(
-    new raft::mr::device::default_allocator);
-
   int nnz = 8;
 
   int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
@@ -164,19 +161,19 @@ TEST_P(COOSymmetrize, Result) {
   float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0,
                                          0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
 
-  COO<float> in(alloc, stream, nnz, 4, 4);
+  COO<float> in(stream, nnz, 4, 4);
   raft::update_device(in.rows(), *&in_rows_h, nnz, stream);
   raft::update_device(in.cols(), *&in_cols_h, nnz, stream);
   raft::update_device(in.vals(), *&in_vals_h, nnz, stream);
 
-  COO<float> out(alloc, stream);
+  COO<float> out(stream);
 
   linalg::coo_symmetrize<32, float>(
     &in, &out,
    [] __device__(int row, int col, float val, float trans) {
      return val + trans;
    },
-    alloc, stream);
+    stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
   std::cout << out << std::endl;
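The symmetrize.cu hunk is representative of the `raft::mr::device::buffer` to `rmm::device_uvector` migration applied across the codebase: the explicit allocator argument disappears and allocations come from RMM's current device resource instead. A minimal sketch of the pattern, assuming only the RMM and RAFT headers already used in this patch:

```cpp
#include <raft/cudart_utils.h>
#include <rmm/device_uvector.hpp>

void scratch_with_uvector(cudaStream_t stream) {
  // Before this patch:
  //   raft::mr::device::buffer<uint64_t> buf(allocator, stream, 2);
  rmm::device_uvector<uint64_t> buf(2, stream);
  CUDA_CHECK(
    cudaMemsetAsync(buf.data(), 0, buf.size() * sizeof(uint64_t), stream));
}  // memory is returned to the current device resource on destruction
```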
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index def1f1685b..122d7f2d6a 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 #include
 #include "../test_utils.h"
 
@@ -30,18 +29,18 @@ template
 class HaversineKNNTest : public ::testing::Test {
  protected:
   void basicTest() {
-    auto alloc = std::make_shared<raft::mr::device::default_allocator>();
+    CUDA_CHECK(cudaStreamCreate(&stream));
 
     // Allocate input
-    raft::allocate(d_train_inputs, n * d);
+    raft::allocate(d_train_inputs, n * d, stream);
 
     // Allocate reference arrays
-    raft::allocate(d_ref_I, n * n);
-    raft::allocate(d_ref_D, n * n);
+    raft::allocate(d_ref_I, n * n, stream);
+    raft::allocate(d_ref_D, n * n, stream);
 
     // Allocate predicted arrays
-    raft::allocate(d_pred_I, n * n);
-    raft::allocate(d_pred_D, n * n);
+    raft::allocate(d_pred_I, n * n, stream);
+    raft::allocate(d_pred_D, n * n, stream);
 
     // make testdata on host
     std::vector<value_t> h_train_inputs = {
@@ -50,7 +49,7 @@ class HaversineKNNTest : public ::testing::Test {
       0.53154002, -1.47049808, 0.72891737, -1.54095137};
     h_train_inputs.resize(n);
-    raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0);
+    raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, stream);
 
     std::vector<value_t> h_res_D = {
       0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
@@ -60,34 +59,28 @@ class HaversineKNNTest : public ::testing::Test {
       0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737,
       0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665};
     h_res_D.resize(n * n);
-    raft::update_device(d_ref_D, h_res_D.data(), n * n, 0);
+    raft::update_device(d_ref_D, h_res_D.data(), n * n, stream);
 
     std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0,
                                       2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1,
                                       4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
     h_res_I.resize(n * n);
-    raft::update_device(d_ref_I, h_res_I.data(), n * n, 0);
+    raft::update_device(d_ref_I, h_res_I.data(), n * n, stream);
 
     std::vector<value_t *> input_vec = {d_train_inputs};
     std::vector<int> sizes_vec = {n};
 
-    cudaStream_t stream;
-    CUDA_CHECK(cudaStreamCreate(&stream));
-
     raft::spatial::knn::detail::haversine_knn(
       d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream);
 
-    CUDA_CHECK(cudaStreamDestroy(stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
   void SetUp() override { basicTest(); }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(d_train_inputs));
-    CUDA_CHECK(cudaFree(d_pred_I));
-    CUDA_CHECK(cudaFree(d_pred_D));
-    CUDA_CHECK(cudaFree(d_ref_I));
-    CUDA_CHECK(cudaFree(d_ref_D));
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
  protected:
@@ -103,6 +96,8 @@ class HaversineKNNTest : public ::testing::Test {
   value_idx *d_ref_I;
   value_t *d_ref_D;
+
+  cudaStream_t stream;
 };
 
 typedef HaversineKNNTest<float> HaversineKNNTestF;
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index de6251d32d..e4d05920c0 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -68,8 +68,8 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
 
     auto stream = handle_.get_stream();
 
-    raft::allocate(actual_labels_, rows_ * k_, true);
-    raft::allocate(expected_labels_, rows_ * k_, true);
+    raft::allocate(actual_labels_, rows_ * k_, stream, true);
+    raft::allocate(expected_labels_, rows_ * k_, stream, true);
 
     std::vector<float *> input_vec;
     std::vector<int> sizes_vec;
@@ -104,6 +104,8 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     cols_ = params_.input[0].size();
     k_ = params_.k;
 
+    cudaStream_t stream = handle_.get_stream();
+
     std::vector<float> row_major_input;
     for (std::size_t i = 0; i < params_.input.size(); ++i) {
       for (std::size_t j = 0; j < params_.input[i].size(); ++j) {
@@ -111,31 +113,27 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
       }
     }
     rmm::device_buffer input_d = rmm::device_buffer(
-      row_major_input.data(), row_major_input.size() * sizeof(float),
-      handle_.get_stream());
+      row_major_input.data(), row_major_input.size() * sizeof(float), stream);
     float *input_ptr = static_cast<float *>(input_d.data());
 
     rmm::device_buffer labels_d = rmm::device_buffer(
-      params_.labels.data(), params_.labels.size() * sizeof(int),
-      handle_.get_stream());
+      params_.labels.data(), params_.labels.size() * sizeof(int), stream);
     int *labels_ptr = static_cast<int *>(labels_d.data());
 
-    raft::allocate(input_, rows_ * cols_, true);
-    raft::allocate(search_data_, rows_ * cols_, true);
-    raft::allocate(indices_, rows_ * k_, true);
-    raft::allocate(distances_, rows_ * k_, true);
-    raft::allocate(search_labels_, rows_, true);
+    raft::allocate(input_, rows_ * cols_, stream, true);
+    raft::allocate(search_data_, rows_ * cols_, stream, true);
+    raft::allocate(indices_, rows_ * k_, stream, true);
+    raft::allocate(distances_, rows_ * k_, stream, true);
+    raft::allocate(search_labels_, rows_, stream, true);
 
-    raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream());
-    raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream());
-    raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream());
+    raft::copy(input_, input_ptr, rows_ * cols_, stream);
+    raft::copy(search_data_, input_ptr, rows_ * cols_, stream);
+    raft::copy(search_labels_, labels_ptr, rows_, stream);
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(search_data_));
-    CUDA_CHECK(cudaFree(indices_));
-    CUDA_CHECK(cudaFree(distances_));
-    CUDA_CHECK(cudaFree(actual_labels_));
+    cudaStream_t stream = handle_.get_stream();
+    raft::deallocate_all(stream);
   }
 
  private:
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index e5c2d52764..b85d35e3f8 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -57,27 +57,24 @@ TEST(Raft, SpectralMatrices) {
   ASSERT_EQ(nullptr, sm2.row_offsets_);
 
   auto stream = h.get_stream();
-  auto t_exe_pol = thrust::cuda::par.on(stream);
 
-  auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    laplacian_matrix_t<index_type, value_type> lm1{h, t_exe_pol, ro, ci,
-                                                   vs, nrows, nnz};
+  auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    laplacian_matrix_t<index_type, value_type> lm1{h, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
 
-  auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) {
-    laplacian_matrix_t<index_type, value_type> lm2{h, t_exe_pol, sm2};
+  auto cnstr_lm2 = [&h, &sm2](void) {
+    laplacian_matrix_t<index_type, value_type> lm2{h, sm2};
   };
   EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
 
-  auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    modularity_matrix_t<index_type, value_type> mm1{h, t_exe_pol, ro, ci,
-                                                    vs, nrows, nnz};
+  auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    modularity_matrix_t<index_type, value_type> mm1{h, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
 
-  auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) {
-    modularity_matrix_t<index_type, value_type> mm2{h, t_exe_pol, sm2};
+  auto cnstr_mm2 = [&h, &sm2](void) {
+    modularity_matrix_t<index_type, value_type> mm2{h, sm2};
   };
   EXPECT_ANY_THROW(cnstr_mm2());  // because of nullptr ptr args
 }
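The spectral_matrix.cu hunk above drops the Thrust policy that callers previously built by hand with `thrust::cuda::par.on(stream)`; per this PR, the handle now exposes `get_thrust_policy()` instead. A sketch of the new calling convention (the reduction itself is illustrative, not from the diff):

```cpp
#include <raft/handle.hpp>
#include <thrust/reduce.h>

double sum_via_handle_policy(const raft::handle_t &h, const double *d, int n) {
  // Before: thrust::reduce(thrust::cuda::par.on(h.get_stream()), d, d + n);
  // The handle's policy is already bound to the handle's stream.
  return thrust::reduce(h.get_thrust_policy(), d, d + n);
}
```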
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 4a3b0ed196..a3c88a92be 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -49,11 +49,10 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
 
-    cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
-    allocate(data, len);
-    allocate(mean_act, cols);
+    raft::allocate(data, len, stream);
+    raft::allocate(mean_act, cols, stream);
 
     r.normal(data, len, params.mean, (T)1.0, stream);
     meanSGtest(data, stream);
@@ -66,13 +65,14 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(data));
-    CUDA_CHECK(cudaFree(mean_act));
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
  protected:
   MeanInputs<T> params;
   T *data, *mean_act;
+  cudaStream_t stream;
 };
 
 // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 8b0d607561..b827230b5d 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -47,17 +47,16 @@ class MeanCenterTest
     params = ::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam();
     raft::random::Rng r(params.seed);
 
-    cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     auto rows = params.rows, cols = params.cols;
     auto len = rows * cols;
     IdxType vecLen = params.bcastAlongRows ? cols : rows;
 
-    raft::allocate(out, len);
-    raft::allocate(out_ref, len);
-    raft::allocate(data, len);
-    raft::allocate(meanVec, vecLen);
+    raft::allocate(out, len, stream);
+    raft::allocate(out_ref, len, stream);
+    raft::allocate(data, len, stream);
+    raft::allocate(meanVec, vecLen, stream);
     r.normal(data, len, params.mean, (T)1.0, stream);
     raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor,
                       stream);
@@ -65,19 +64,18 @@ class MeanCenterTest
                      params.bcastAlongRows, stream);
     raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows,
                               params.rowMajor, params.bcastAlongRows, (T)-1.0);
-    CUDA_CHECK(cudaStreamDestroy(stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(out));
-    CUDA_CHECK(cudaFree(out_ref));
-    CUDA_CHECK(cudaFree(data));
-    CUDA_CHECK(cudaFree(meanVec));
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
 protected:
   MeanCenterInputs<T, IdxType> params;
   T *data, *meanVec, *out, *out_ref;
+  cudaStream_t stream;
 };
 
 const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index ff2698788f..fd374249d2 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -47,15 +47,14 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
-    cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
-    allocate(data, len);
-    allocate(mean_act, cols);
-    allocate(stddev_act, cols);
-    allocate(vars_act, cols);
+    raft::allocate(data, len, stream);
+    raft::allocate(mean_act, cols, stream);
+    raft::allocate(stddev_act, cols, stream);
+    raft::allocate(vars_act, cols, stream);
     r.normal(data, len, params.mean, params.stddev, stream);
     stdVarSGtest(data, stream);
-    CUDA_CHECK(cudaStreamDestroy(stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
   void stdVarSGtest(T *data, cudaStream_t stream) {
@@ -73,15 +72,14 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(data));
-    CUDA_CHECK(cudaFree(mean_act));
-    CUDA_CHECK(cudaFree(stddev_act));
-    CUDA_CHECK(cudaFree(vars_act));
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
 protected:
   StdDevInputs<T> params;
   T *data, *mean_act, *stddev_act, *vars_act;
+  cudaStream_t stream;
 };
 
 const std::vector<StdDevInputs<float>> inputsf = {
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index c3140d4588..58ebec7859 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -43,9 +43,8 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
     params = ::testing::TestWithParam<SumInputs<T>>::GetParam();
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
-    cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
-    raft::allocate(data, len);
+    raft::allocate(data, len, stream);
 
     T data_h[len];
     for (int i = 0; i < len; i++) {
       data_h[i] = T(1);
     }
 
     raft::update_device(data, data_h, len, stream);
 
-    raft::allocate(sum_act, cols);
+    raft::allocate(sum_act, cols, stream);
     sum(sum_act, data, cols, rows, false, stream);
-    CUDA_CHECK(cudaStreamDestroy(stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
   void TearDown() override {
-    CUDA_CHECK(cudaFree(data));
-    CUDA_CHECK(cudaFree(sum_act));
+    raft::deallocate_all(stream);
+    CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
 protected:
   SumInputs<T> params;
   T *data, *sum_act;
+  cudaStream_t stream;
 };
 
 const std::vector<SumInputs<float>> inputsf = {{0.05f, 1024, 32, 1234ULL},
diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd
index 6076640312..884d81bed1 100644
--- a/python/raft/common/handle.pxd
+++ b/python/raft/common/handle.pxd
@@ -34,7 +34,5 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil:
         handle_t() except +
         handle_t(int ns) except +
         void set_stream(_Stream s) except +
-        void set_device_allocator(shared_ptr[allocator] a) except +
-        shared_ptr[allocator] get_device_allocator() except +
         _Stream get_stream() except +
         int get_num_internal_streams() except +
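For reference, the fixture pattern the test files converge on pairs `raft::allocate` with `raft::deallocate_all`. A hedged sketch of that lifecycle; the internal bookkeeping is this PR's "allocation + deallocation helper system", so the comments describe observable behavior rather than implementation details:

```cpp
#include <raft/cudart_utils.h>

void allocate_deallocate_pair() {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  float *data, *zeroed;
  raft::allocate(data, 1024, stream);          // tracked, left uninitialized
  raft::allocate(zeroed, 1024, stream, true);  // trailing flag requests a
                                               // memset-to-zero on `stream`

  // ... launch work on `stream` using data/zeroed ...

  raft::deallocate_all(stream);  // frees every pointer obtained through
                                 // raft::allocate, replacing the per-pointer
                                 // cudaFree calls removed from TearDown above
  CUDA_CHECK(cudaStreamDestroy(stream));
}
```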