From f774b0b21fa3af4b553534dc56cb4f37d4e5c2c7 Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Tue, 29 Jun 2021 12:47:14 +0200
Subject: [PATCH 01/17] Replace RAFT buffer adapter with RMM buffer

---
 cpp/include/raft/common/cub_wrappers.cuh      |  4 +-
 cpp/include/raft/comms/test.hpp               | 41 +++------
 cpp/include/raft/distance/distance.cuh        |  7 +-
 cpp/include/raft/label/classlabels.cuh        | 10 +--
 cpp/include/raft/lap/lap.cuh                  | 83 ++++++++++---------
 cpp/include/raft/lap/lap_functions.cuh        | 47 ++++-------
 cpp/include/raft/linalg/eig.cuh               | 16 ++--
 cpp/include/raft/linalg/qr.cuh                | 18 ++--
 cpp/include/raft/linalg/svd.cuh               | 27 +++---
 cpp/include/raft/matrix/math.cuh              |  7 +-
 cpp/include/raft/random/rng.cuh               | 12 +--
 cpp/include/raft/sparse/convert/csr.cuh       | 10 +--
 cpp/include/raft/sparse/coo.cuh               | 27 +++---
 cpp/include/raft/sparse/csr.cuh               |  6 +-
 .../raft/sparse/distance/bin_distance.cuh     | 26 +++---
 cpp/include/raft/sparse/distance/csr_spmv.cuh |  4 +-
 .../raft/sparse/distance/ip_distance.cuh      | 33 ++++----
 .../raft/sparse/distance/l2_distance.cuh      | 32 +++----
 .../raft/sparse/distance/lp_distance.cuh      |  6 +-
 cpp/include/raft/sparse/linalg/add.cuh        |  4 +-
 cpp/include/raft/sparse/linalg/spectral.cuh   | 14 ++--
 cpp/include/raft/sparse/linalg/symmetrize.cuh |  9 +-
 cpp/include/raft/sparse/linalg/transpose.h    |  6 +-
 cpp/include/raft/sparse/op/filter.cuh         | 10 +--
 .../knn/detail/knn_brute_force_faiss.cuh      |  9 +-
 .../raft/spatial/knn/detail/processing.hpp    | 10 +--
 cpp/test/lap/lap.cu                           | 17 ++--
 cpp/test/linalg/cholesky_r1.cu                | 25 +++---
 cpp/test/linalg/map_then_reduce.cu            | 10 +--
 cpp/test/matrix/matrix.cu                     | 15 ++--
 cpp/test/sparse/dist_coo_spmv.cu              |  6 +-
 cpp/test/sparse/linkage.cu                    |  2 +-
 cpp/test/sparse/symmetrize.cu                 |  9 +-
 33 files changed, 252 insertions(+), 310 deletions(-)
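Every hunk below applies the same mechanical substitution, so it is worth
spelling the pattern out once. A minimal sketch (hypothetical function and
buffer names, not taken from any hunk; assumes a valid raft::handle_t and
RAFT's CUDA_CHECK macro from <raft/cudart_utils.h>):

    #include <raft/cudart_utils.h>
    #include <raft/handle.hpp>
    #include <rmm/device_uvector.hpp>

    void buffer_migration_example(const raft::handle_t &handle, std::size_t n) {
      cudaStream_t stream = handle.get_stream();

      // Old RAFT adapter: allocator passed explicitly, stream remembered by
      // the buffer, so later resize() calls took no stream argument:
      //   raft::mr::device::buffer<int> buf(handle.get_device_allocator(), stream, n);
      //   buf.resize(2 * n);

      // RMM equivalent: (size, stream) constructor, allocation drawn from the
      // current RMM device memory resource; every resize() names the stream.
      rmm::device_uvector<int> buf(n, stream);
      buf.resize(2 * n, stream);

      // device_uvector memory is uninitialized -- zero it explicitly wherever
      // the old code relied on initialized contents.
      CUDA_CHECK(cudaMemsetAsync(buf.data(), 0, buf.size() * sizeof(int), stream));
    }

Note that the loss of the remembered stream and of zero-initialization is why
the hunks below pass the stream to every resize() and keep (or add) explicit
cudaMemsetAsync calls.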
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index 8d5b29f700..8e3519fea5 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cub/cub.cuh>
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 
@@ -34,7 +34,7 @@ namespace raft {
  * @param stream cuda stream
  */
 template <typename KeyT, typename ValueT>
-void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
+void sortPairs(rmm::device_uvector<char> &workspace, const KeyT *inKeys,
                KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
                cudaStream_t stream) {
   size_t worksize;
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 4e95c4eef0..9f5edc1425 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -18,7 +18,6 @@
 
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 
 #include
 #include
@@ -44,8 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   CUDA_CHECK(
     cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
@@ -76,8 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   if (communicator.get_rank() == root)
     CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
@@ -104,8 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
+  rmm::device_uvector<int> temp_d(1, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -134,11 +130,8 @@ bool test_collective_allgather(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       communicator.get_size());
+  rmm::device_uvector<int> temp_d(1, stream);
+  rmm::device_uvector<int> recv_d(communicator.get_size(), stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -169,12 +162,9 @@ bool test_collective_gather(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(1, stream);
-
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? communicator.get_size() : 0);
+  rmm::device_uvector<int> temp_d(1, stream);
+  rmm::device_uvector<int> recv_d(
+    communicator.get_rank() == root ? communicator.get_size() : 0, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
                              cudaMemcpyHostToDevice, stream));
@@ -211,12 +201,9 @@ bool test_collective_gatherv(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
-  temp_d.resize(sends.size(), stream);
-
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? displacements.back() : 0);
+  rmm::device_uvector<int> temp_d(sends.size(), stream);
+  rmm::device_uvector<int> recv_d(
+    communicator.get_rank() == root ?
displacements.back() : 0, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
                              sends.size() * sizeof(int), cudaMemcpyHostToDevice,
@@ -256,10 +243,8 @@ bool test_collective_reducescatter(const handle_t &handle, int root) {
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream,
-                                       sends.size());
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       1);
+  rmm::device_uvector<int> temp_d(sends.size(), stream);
+  rmm::device_uvector<int> recv_d(1, stream);
 
   CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
                              sends.size() * sizeof(int), cudaMemcpyHostToDevice,
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 579b3bb446..a0a47a2bdb 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace distance {
@@ -243,7 +243,7 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
 template <typename Type, typename Index_, raft::distance::DistanceType DistType>
 void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
                             Index_ n, Index_ k,
-                            raft::mr::device::buffer<char> &workspace,
+                            rmm::device_uvector<char> &workspace,
                             cudaStream_t stream, bool isRowMajor) {
   auto worksize = getWorkspaceSize(x, y, m, n, k);
@@ -254,8 +254,7 @@ void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
 template <typename Type, typename Index_ = int>
 void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
-                       Index_ n, Index_ k,
-                       raft::mr::device::buffer<char> &workspace,
+                       Index_ n, Index_ k, rmm::device_uvector<char> &workspace,
                        raft::distance::DistanceType metric, cudaStream_t stream,
                        bool isRowMajor = true) {
   switch (metric) {
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 0da7da2eb6..f2b2463165 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace label {
@@ -46,9 +46,9 @@ template <typename value_t>
 void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
                      cudaStream_t stream,
                      std::shared_ptr<raft::mr::device::allocator> allocator) {
-  raft::mr::device::buffer<value_t> y2(allocator, stream, n);
-  raft::mr::device::buffer<value_t> y3(allocator, stream, n);
-  raft::mr::device::buffer<int> d_num_selected(allocator, stream, 1);
+  rmm::device_uvector<value_t> y2(n, stream);
+  rmm::device_uvector<value_t> y3(n, stream);
+  rmm::device_uvector<int> d_num_selected(1, stream);
 
   size_t bytes = 0;
   size_t bytes2 = 0;
@@ -58,7 +58,7 @@
   cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(),
                             d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
-  raft::mr::device::buffer<char> cub_storage(allocator, stream, bytes);
+  rmm::device_uvector<char> cub_storage(bytes, stream);
 
   // Select Unique classes
   cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n);
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index 6bc1c08029..29c1cf6aa1 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -25,6 +25,7 @@
 #pragma once
 
 #include
+#include <rmm/device_uvector.hpp>
 
 #include "d_structs.h"
 #include "lap_functions.cuh"
@@ -44,19 +45,19 @@ class LinearAssignmentProblem {
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
   raft::handle_t const &handle_;
 
-  raft::mr::device::buffer<int> row_covers_v;
-  raft::mr::device::buffer<int> col_covers_v;
-  raft::mr::device::buffer<weight_t> row_duals_v;
-  raft::mr::device::buffer<weight_t> col_duals_v;
-  raft::mr::device::buffer<weight_t> col_slacks_v;
-  raft::mr::device::buffer<int> row_is_visited_v;
-  raft::mr::device::buffer<int> col_is_visited_v;
-  raft::mr::device::buffer<vertex_t> row_parents_v;
-  raft::mr::device::buffer<vertex_t> col_parents_v;
-  raft::mr::device::buffer<vertex_t> row_children_v;
-  raft::mr::device::buffer<vertex_t> col_children_v;
-  raft::mr::device::buffer<weight_t> obj_val_primal_v;
-  raft::mr::device::buffer<weight_t> obj_val_dual_v;
+  rmm::device_uvector<int> row_covers_v;
+  rmm::device_uvector<int> col_covers_v;
+  rmm::device_uvector<weight_t> row_duals_v;
+  rmm::device_uvector<weight_t> col_duals_v;
+  rmm::device_uvector<weight_t> col_slacks_v;
+  rmm::device_uvector<int> row_is_visited_v;
+  rmm::device_uvector<int> col_is_visited_v;
+  rmm::device_uvector<vertex_t> row_parents_v;
+  rmm::device_uvector<vertex_t> col_parents_v;
+  rmm::device_uvector<vertex_t> row_children_v;
+  rmm::device_uvector<vertex_t> col_children_v;
+  rmm::device_uvector<weight_t> obj_val_primal_v;
+  rmm::device_uvector<weight_t> obj_val_dual_v;
 
  public:
   LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size,
                           vertex_t batchsize, weight_t epsilon)
     : handle_(handle),
       size_(size),
       batchsize_(batchsize),
       epsilon_(epsilon),
       d_costs_(nullptr),
-      row_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_slacks_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {}
+      row_covers_v(0, handle_.get_stream()),
+      col_covers_v(0, handle_.get_stream()),
+      row_duals_v(0, handle_.get_stream()),
+      col_duals_v(0, handle_.get_stream()),
+      col_slacks_v(0, handle_.get_stream()),
+      row_is_visited_v(0, handle_.get_stream()),
+      col_is_visited_v(0, handle_.get_stream()),
+      row_parents_v(0, handle_.get_stream()),
+      col_parents_v(0, handle_.get_stream()),
+      row_children_v(0, handle_.get_stream()),
+      col_children_v(0, handle_.get_stream()),
+      obj_val_primal_v(0, handle_.get_stream()),
+      obj_val_dual_v(0, handle_.get_stream()) {}
 
   // Executes Hungarian algorithm on the input cost matrix.
   void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment,
@@ -152,19 +153,20 @@
 
  private:
   // Helper function for initializing global variables and arrays on a single host.
   void initializeDevice() {
-    row_covers_v.resize(batchsize_ * size_);
-    col_covers_v.resize(batchsize_ * size_);
-    row_duals_v.resize(batchsize_ * size_);
-    col_duals_v.resize(batchsize_ * size_);
-    col_slacks_v.resize(batchsize_ * size_);
-    row_is_visited_v.resize(batchsize_ * size_);
-    col_is_visited_v.resize(batchsize_ * size_);
-    row_parents_v.resize(batchsize_ * size_);
-    col_parents_v.resize(batchsize_ * size_);
-    row_children_v.resize(batchsize_ * size_);
-    col_children_v.resize(batchsize_ * size_);
-    obj_val_primal_v.resize(batchsize_);
-    obj_val_dual_v.resize(batchsize_);
+    cudaStream_t stream = handle_.get_stream();
+    row_covers_v.resize(batchsize_ * size_, stream);
+    col_covers_v.resize(batchsize_ * size_, stream);
+    row_duals_v.resize(batchsize_ * size_, stream);
+    col_duals_v.resize(batchsize_ * size_, stream);
+    col_slacks_v.resize(batchsize_ * size_, stream);
+    row_is_visited_v.resize(batchsize_ * size_, stream);
+    col_is_visited_v.resize(batchsize_ * size_, stream);
+    row_parents_v.resize(batchsize_ * size_, stream);
+    col_parents_v.resize(batchsize_ * size_, stream);
+    row_children_v.resize(batchsize_ * size_, stream);
+    col_children_v.resize(batchsize_ * size_, stream);
+    obj_val_primal_v.resize(batchsize_, stream);
+    obj_val_dual_v.resize(batchsize_, stream);
 
     d_vertices_dev.row_covers = row_covers_v.data();
     d_vertices_dev.col_covers = col_covers_v.data();
@@ -231,8 +233,7 @@
   int hungarianStep3() {
     int next;
 
-    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(),
-                                          handle_.get_stream(), 1);
+    rmm::device_uvector<bool> flag_v(1, handle_.get_stream());
 
     bool h_flag = false;
     raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream());
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh
index 0079f50e82..ce3f67f8fe 100644
--- a/cpp/include/raft/lap/lap_functions.cuh
+++ b/cpp/include/raft/lap/lap_functions.cuh
@@ -34,9 +34,8 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
-
 #include
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace lap {
@@ -127,10 +126,8 @@ inline void computeInitialAssignments(raft::handle_t const &handle,
 
   std::size_t size = SP * N;
 
-  raft::mr::device::buffer<int> row_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
-  raft::mr::device::buffer<int> col_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
+  rmm::device_uvector<int> row_lock_v(size, handle.get_stream());
+  rmm::device_uvector<int> col_lock_v(size, handle.get_stream());
 
   thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1);
   thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1);
@@ -216,25 +213,21 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
                                    weight_t epsilon) {
   vertex_t M;
 
-  raft::mr::device::buffer<vertex_t> csr_ptrs_v(handle.get_device_allocator(),
-                                                handle.get_stream(), 0);
-  raft::mr::device::buffer<vertex_t> csr_neighbors_v(
-    handle.get_device_allocator(), handle.get_stream(), 0);
+  rmm::device_uvector<vertex_t> csr_ptrs_v(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> csr_neighbors_v(0, handle.get_stream());
 
   {
     dim3 blocks_per_grid;
     dim3 threads_per_block;
     int total_blocks = 0;
 
-    raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                                handle.get_stream(), SP * N);
-    raft::mr::device::buffer<vertex_t> addresses_v(
-      handle.get_device_allocator(), handle.get_stream(), SP * N);
+    rmm::device_uvector<bool> predicates_v(SP * N, handle.get_stream());
+    rmm::device_uvector<vertex_t> addresses_v(SP * N, handle.get_stream());
 
     thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
    thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
-    csr_ptrs_v.resize(SP + 1);
+    csr_ptrs_v.resize(SP + 1, handle.get_stream());
 
     thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1});
 
                            addresses_v.end(), addresses_v.begin());
 
     if (M > 0) {
-      csr_neighbors_v.resize(M);
+      csr_neighbors_v.resize(M, handle.get_stream());
 
       kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block>>>(
@@ -302,10 +295,8 @@ inline void reversePass(raft::handle_t const &handle,
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, size);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), size);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), size);
+  rmm::device_uvector<bool> predicates_v(size, handle.get_stream());
+  rmm::device_uvector<vertex_t> addresses_v(size, handle.get_stream());
 
   thrust::fill_n(thrust::device, predicates_v.data(), size, false);
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
@@ -331,8 +322,7 @@
     raft::lap::detail::calculateLinearDims(
       blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size);
 
-    raft::mr::device::buffer<vertex_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(), csr_size);
+    rmm::device_uvector<vertex_t> elements_v(csr_size, handle.get_stream());
 
     kernel_augmentScatter<<<blocks_per_grid, threads_per_block>>>(
@@ -360,10 +350,8 @@ inline void augmentationPass(raft::handle_t const &handle,
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, SP * N);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), SP * N);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), SP * N);
+  rmm::device_uvector<bool> predicates_v(SP * N, handle.get_stream());
+  rmm::device_uvector<vertex_t> addresses_v(SP * N, handle.get_stream());
 
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
@@ -390,8 +378,8 @@
     raft::lap::detail::calculateLinearDims(
       blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size);
 
-    raft::mr::device::buffer<vertex_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size);
+    rmm::device_uvector<vertex_t> elements_v(row_ids_csr_size,
+                                             handle.get_stream());
 
     kernel_augmentScatter<<<blocks_per_grid, threads_per_block>>>(
@@ -420,8 +408,7 @@ inline void dualUpdate(raft::handle_t const &handle,
   dim3 threads_per_block;
   int total_blocks;
 
-  raft::mr::device::buffer<weight_t> sp_min_v(handle.get_device_allocator(),
-                                              handle.get_stream(), 1);
+  rmm::device_uvector<weight_t> sp_min_v(1, handle.get_stream());
 
   raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
                                          total_blocks, SP);
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 6172618380..8ab7011db4 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace linalg {
@@ -52,8 +52,8 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
                                            n_cols, eig_vals, &lwork));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> d_dev_info(1, stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
@@ -104,9 +104,9 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
     CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
     n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
-  raft::mr::device::buffer<math_t> d_eig_vectors(allocator, stream, 0);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> d_dev_info(1, stream);
+  rmm::device_uvector<math_t> d_eig_vectors(0, stream);
 
   if (memUsage == OVERWRITE_INPUT) {
     CUSOLVER_CHECK(cusolverDnsyevdx(
@@ -176,8 +176,8 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
     eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
-  raft::mr::device::buffer<int> dev_info(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_uvector<int> dev_info(1, stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index cafa8d54f1..14771f289d 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -19,7 +19,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace linalg {
@@ -42,7 +42,6 @@ template <typename math_t>
 void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
             int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
@@ -50,14 +49,14 @@
   CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
                              cudaMemcpyDeviceToDevice, stream));
 
-  raft::mr::device::buffer<math_t> tau(allocator, stream, k);
+  rmm::device_uvector<math_t> tau(k, stream);
   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   int Lwork;
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
-  raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
   CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
                                  workspace.data(), Lwork, devInfo.data(),
                                  stream));
@@ -86,12 +85,11 @@ template <typename math_t>
 void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
              int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
-  raft::mr::device::buffer<math_t> R_full(allocator, stream, m * n);
-  raft::mr::device::buffer<math_t> tau(allocator, stream, min(m, n));
+  rmm::device_uvector<math_t> R_full(m * n, stream);
+  rmm::device_uvector<math_t> tau(min(m, n), stream);
   CUDA_CHECK(
     cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
   int R_full_nrows = m, R_full_ncols = n;
@@ -99,12 +97,12 @@
                              cudaMemcpyDeviceToDevice, stream));
 
   int Lwork;
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
                                             R_full_ncols, R_full.data(),
                                             R_full_nrows, &Lwork));
-  raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
   CUSOLVER_CHECK(cusolverDngeqrf(
     cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
     tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 7357a68a4c..a1507cfa9b 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include "eig.cuh"
 #include "gemm.cuh"
 #include "transpose.h"
@@ -54,8 +54,6 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
            T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
            bool trans_right, bool gen_left_vec, bool gen_right_vec,
            cudaStream_t stream) {
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
@@ -71,13 +69,13 @@
   const int m = n_rows;
   const int n = n_cols;
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   T *d_rwork = nullptr;
 
   int lwork = 0;
   CUSOLVER_CHECK(
     cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
-  raft::mr::device::buffer<T> d_work(allocator, stream, lwork);
+  rmm::device_uvector<T> d_work(lwork, stream);
 
   char jobu = 'S';
   char jobvt = 'A';
@@ -112,12 +110,11 @@ template <typename T>
 void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
             T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int len = n_cols * n_cols;
-  raft::mr::device::buffer<T> in_cross_mult(allocator, stream, len);
+  rmm::device_uvector<T> in_cross_mult(len, stream);
 
   T alpha = T(1);
   T beta = T(0);
@@ -162,7 +159,6 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
                math_t *sing_vals, math_t *left_sing_vecs,
                math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
                math_t tol, int max_sweeps, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   gesvdjInfo_t gesvdj_params = NULL;
@@ -174,7 +170,7 @@
   int m = n_rows;
   int n = n_cols;
 
-  raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
+  rmm::device_uvector<int> devInfo(1, stream);
   int lwork = 0;
   int econ = 1;
 
@@ -183,7 +179,7 @@
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
     left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
 
-  raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
+  rmm::device_uvector<math_t> d_work(lwork, stream);
 
   CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
@@ -210,10 +206,8 @@ template <typename math_t>
 void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
                        math_t *V, math_t *out, int n_rows, int n_cols, int k,
                        cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
-
   const math_t alpha = 1.0, beta = 0.0;
-  raft::mr::device::buffer<math_t> SVT(allocator, stream, k * n_cols);
+  rmm::device_uvector<math_t> SVT(k * n_cols, stream);
 
   raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
                      CUBLAS_OP_T, alpha, beta, stream);
@@ -239,14 +233,13 @@ template <typename math_t>
 bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
                          math_t *S_vec, math_t *V, int n_rows, int n_cols,
                          int k, math_t tol, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int m = n_rows, n = n_cols;
 
   // form product matrix
-  raft::mr::device::buffer<math_t> P_d(allocator, stream, m * n);
-  raft::mr::device::buffer<math_t> S_mat(allocator, stream, k * k);
+  rmm::device_uvector<math_t> P_d(m * n, stream);
+  rmm::device_uvector<math_t> S_mat(k * k, stream);
   CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
   CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
@@ -262,7 +255,7 @@
 
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
-  raft::mr::device::buffer<math_t> A_minus_P(allocator, stream, m * n);
+  rmm::device_uvector<math_t> A_minus_P(m * n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index 0a72117140..f84b85d222 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace matrix {
@@ -294,10 +294,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
   auto d_src = src;
   auto d_dest = dest;
 
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
-
-  raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
+  rmm::device_uvector<math_t> d_sum(1, stream);
   auto *d_sum_ptr = d_sum.data();
   auto no_op = [] __device__(math_t in) { return in; };
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 56710ea81f..a96c0bae38 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -25,8 +25,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
+#include <rmm/device_uvector.hpp>
 #include
 #include "rng_impl.cuh"
 
@@ -512,10 +512,10 @@ class Rng {
     std::shared_ptr<raft::mr::device::allocator> allocator =
       handle.get_device_allocator();
 
-    raft::mr::device::buffer<WeightsT> expWts(allocator, stream, len);
-    raft::mr::device::buffer<WeightsT> sortedWts(allocator, stream, len);
-    raft::mr::device::buffer<IdxT> inIdx(allocator, stream, len);
-    raft::mr::device::buffer<IdxT> outIdxBuff(allocator, stream, len);
+    rmm::device_uvector<WeightsT> expWts(len, stream);
+    rmm::device_uvector<WeightsT> sortedWts(len, stream);
+    rmm::device_uvector<IdxT> inIdx(len, stream);
+    rmm::device_uvector<IdxT> outIdxBuff(len, stream);
     auto *inIdxPtr = inIdx.data();
     // generate modified weights
     custom_distribution(
@@ -533,7 +533,7 @@
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
     IdxT *outIdxPtr = outIdxBuff.data();
-    raft::mr::device::buffer<char> workspace(allocator, stream);
+    rmm::device_uvector<char> workspace(0, stream);
     sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr,
               (int)len, stream);
     if (outIdx != nullptr) {
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index a034bdbda8..31a8f54721 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -50,15 +50,15 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
   auto stream = handle.get_stream();
   auto cusparseHandle = handle.get_cusparse_handle();
   auto d_alloc = handle.get_device_allocator();
-  raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz);
+  rmm::device_uvector<int> dstRows(nnz, stream);
   CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
                              cudaMemcpyDeviceToDevice, stream));
   CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
                              cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
-  raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize);
-  raft::mr::device::buffer<int> P(d_alloc, stream, nnz);
+  rmm::device_uvector<char> pBuffer(buffSize, stream);
+  rmm::device_uvector<int> P(nnz, stream);
   CUSPARSE_CHECK(
     cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
   raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
@@ -154,7 +154,7 @@ template <typename T>
 void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
                        cudaStream_t stream) {
-  raft::mr::device::buffer<T> row_counts(d_alloc, stream, m);
+  rmm::device_uvector<T> row_counts(m, stream);
 
   CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
index 73120fea8c..40d1a06720 100644
--- a/cpp/include/raft/sparse/coo.cuh
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -18,7 +18,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include
 
@@ -58,9 +58,9 @@ namespace sparse {
 template <typename T, typename Index_Type = int>
 class COO {
  protected:
-  raft::mr::device::buffer<Index_Type> rows_arr;
-  raft::mr::device::buffer<Index_Type> cols_arr;
-  raft::mr::device::buffer<T> vals_arr;
+  rmm::device_uvector<Index_Type> rows_arr;
+  rmm::device_uvector<Index_Type> cols_arr;
+  rmm::device_uvector<T> vals_arr;
 
  public:
   Index_Type nnz;
@@ -72,9 +72,9 @@
    * @param stream: CUDA stream to use
    */
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream)
-    : rows_arr(d_alloc, stream, 0),
-      cols_arr(d_alloc, stream, 0),
-      vals_arr(d_alloc, stream, 0),
+    : rows_arr(0, stream),
+      cols_arr(0, stream),
+      vals_arr(0, stream),
       nnz(0),
       n_rows(0),
       n_cols(0) {}
@@ -87,10 +87,9 @@
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of cols in the dense matrix
    */
-  COO(raft::mr::device::buffer<Index_Type> &rows,
-      raft::mr::device::buffer<Index_Type> &cols,
-      raft::mr::device::buffer<T> &vals, Index_Type nnz, Index_Type n_rows = 0,
-      Index_Type n_cols = 0)
+  COO(rmm::device_uvector<Index_Type> &rows,
+      rmm::device_uvector<Index_Type> &cols, rmm::device_uvector<T> &vals,
+      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0)
     : rows_arr(rows),
       cols_arr(cols),
      vals_arr(vals),
@@ -109,9 +108,9 @@
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream,
       Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0,
       bool init = true)
-    : rows_arr(d_alloc, stream, nnz),
-      cols_arr(d_alloc, stream, nnz),
-      vals_arr(d_alloc, stream, nnz),
+    : rows_arr(nnz, stream),
+      cols_arr(nnz, stream),
+      vals_arr(nnz, stream),
       nnz(nnz),
       n_rows(n_rows),
       n_cols(n_cols) {
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
index bc4a68d296..b30d7af8b4 100644
--- a/cpp/include/raft/sparse/csr.cuh
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -219,7 +219,7 @@
 void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
             Index_ nnz, Index_ N,
             std::shared_ptr<raft::mr::device::allocator> d_alloc,
             cudaStream_t stream, Lambda filter_op) {
-  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  rmm::device_uvector<bool> m(1, stream);
 
   WeakCCState state(m.data());
   weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
@@ -253,7 +253,7 @@
 void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
             Index_ nnz, Index_ N,
             std::shared_ptr<raft::mr::device::allocator> d_alloc,
             cudaStream_t stream) {
-  raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
+  rmm::device_uvector<bool> m(1, stream);
   WeakCCState state(m.data());
   weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream,
                   [](Index_) { return true; });
diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh
index ae5cbdf9d3..1ad5466aad 100644
--- a/cpp/include/raft/sparse/distance/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/bin_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -88,8 +88,8 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
                          cusparseHandle_t handle,
                          std::shared_ptr<raft::mr::device::allocator> alloc,
                          cudaStream_t stream, expansion_f expansion_func) {
-  raft::mr::device::buffer<value_t> Q_norms(alloc, stream, m);
-  raft::mr::device::buffer<value_t> R_norms(alloc, stream, n);
+  rmm::device_uvector<value_t> Q_norms(m, stream);
+  rmm::device_uvector<value_t> R_norms(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
   CUDA_CHECK(
@@ -113,9 +113,7 @@ class jaccard_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit jaccard_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -123,8 +121,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -149,7 +147,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -162,9 +160,7 @@ class dice_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit dice_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -172,8 +168,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -194,7 +190,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh
index cd8ca09913..2ee0abe47a 100644
--- a/cpp/include/raft/sparse/distance/csr_spmv.cuh
+++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh
@@ -20,7 +20,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -346,7 +346,7 @@ template <typename value_idx>
 inline value_idx max_degree(
   value_idx *indptr, value_idx n_rows,
   std::shared_ptr<raft::mr::device::allocator> allocator,
   cudaStream_t stream) {
-  raft::mr::device::buffer<value_idx> max_d(allocator, stream, 1);
+  rmm::device_uvector<value_idx> max_d(1, stream);
   CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream));
 
   /**
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
index 90717bfc5f..6297b009f4 100644
--- a/cpp/include/raft/sparse/distance/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -23,7 +23,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -92,10 +92,10 @@ class ip_distances_gemm_t : public ip_trans_getters_t<value_idx, value_t> {
  explicit ip_distances_gemm_t(
    const distances_config_t<value_idx, value_t> &config)
    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      csc_indptr(config.allocator, config.stream, 0),
-      csc_indices(config.allocator, config.stream, 0),
-      csc_data(config.allocator, config.stream, 0),
+      workspace(0, config.stream),
+      csc_indptr(0, config.stream),
+      csc_indices(0, config.stream),
+      csc_data(0, config.stream),
      alpha(1.0) {
    init_mat_descriptor(matA);
    init_mat_descriptor(matB);
@@ -118,12 +118,10 @@
     /**
      * Compute pairwise distances and return dense matrix in column-major format
      */
-    raft::mr::device::buffer<value_idx> out_batch_indptr(
-      config_->allocator, config_->stream, config_->a_nrows + 1);
-    raft::mr::device::buffer<value_idx> out_batch_indices(config_->allocator,
-                                                          config_->stream, 0);
-    raft::mr::device::buffer<value_t> out_batch_data(config_->allocator,
-                                                     config_->stream, 0);
+    rmm::device_uvector<value_idx> out_batch_indptr(config_->a_nrows + 1,
+                                                    config_->stream);
+    rmm::device_uvector<value_idx> out_batch_indices(0, config_->stream);
+    rmm::device_uvector<value_t> out_batch_data(0, config_->stream);
 
     value_idx out_batch_nnz = get_nnz(out_batch_indptr.data());
 
@@ -219,10 +217,10 @@
  cusparseMatDescr_t matC;
  cusparseMatDescr_t matD;
  cusparsePointerMode_t orig_ptr_mode;
-  raft::mr::device::buffer<char> workspace;
-  raft::mr::device::buffer<value_idx> csc_indptr;
-  raft::mr::device::buffer<value_idx> csc_indices;
-  raft::mr::device::buffer<value_t> csc_data;
+  rmm::device_uvector<char> workspace;
+  rmm::device_uvector<value_idx> csc_indptr;
+  rmm::device_uvector<value_idx> csc_indices;
+  rmm::device_uvector<value_t> csc_data;
  const distances_config_t<value_idx, value_t> *config_;
};
 
@@ -234,8 +232,7 @@ class ip_distances_spmv_t : public ip_trans_getters_t<value_idx, value_t> {
   * @param[in] config specifies inputs, outputs, and sizes
   */
  ip_distances_spmv_t(const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      coo_rows_b(config.allocator, config.stream, config.b_nnz) {
+    : config_(&config), coo_rows_b(config.b_nnz, config.stream) {
    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
                                      coo_rows_b.data(), config_->b_nnz,
                                      config_->stream);
@@ -262,7 +259,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<value_idx> coo_rows_b;
+  rmm::device_uvector<value_idx> coo_rows_b;
};
 
 template
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
index 829471e0e3..3898189630 100644
--- a/cpp/include/raft/sparse/distance/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -92,8 +92,8 @@ void compute_l2(value_t *out, const value_idx *Q_coo_rows,
                cusparseHandle_t handle,
                std::shared_ptr<raft::mr::device::allocator> alloc,
                cudaStream_t stream, expansion_f expansion_func) {
-  raft::mr::device::buffer<value_t> Q_sq_norms(alloc, stream, m);
-  raft::mr::device::buffer<value_t> R_sq_norms(alloc, stream, n);
+  rmm::device_uvector<value_t> Q_sq_norms(m, stream);
+  rmm::device_uvector<value_t> R_sq_norms(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
   CUDA_CHECK(
@@ -117,9 +117,7 @@ class l2_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit l2_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -127,8 +125,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -146,7 +144,7 @@
 
 protected:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -186,9 +184,7 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit cosine_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
@@ -196,8 +192,8 @@
     value_idx *b_indices = ip_dists.b_rows_coo();
     value_t *b_data = ip_dists.b_data_coo();
 
-    raft::mr::device::buffer<value_idx> search_coo_rows(
-      config_->allocator, config_->stream, config_->a_nnz);
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz,
+                                                   config_->stream);
     raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
                                       search_coo_rows.data(), config_->a_nnz,
                                       config_->stream);
@@ -221,7 +217,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
 
@@ -239,9 +235,7 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
 public:
  explicit hellinger_expanded_distances_t(
    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(config.allocator, config.stream, 0),
-      ip_dists(config) {}
+    : config_(&config), workspace(0, config.stream), ip_dists(config) {}
 
   void compute(value_t *out_dists) {
     // First sqrt A and B
@@ -282,7 +276,7 @@
 
 private:
  const distances_config_t<value_idx, value_t> *config_;
-  raft::mr::device::buffer<char> workspace;
+  rmm::device_uvector<char> workspace;
  ip_distances_t<value_idx, value_t> ip_dists;
};
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
index e524d87b7c..4a7d5e53ff 100644
--- a/cpp/include/raft/sparse/distance/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -24,7 +24,7 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -62,8 +62,8 @@ void unexpanded_lp_distances(
   // for max occupancy.
   // Ref: https://github.com/rapidsai/cuml/issues/3371
 
-  raft::mr::device::buffer<value_idx> coo_rows(
-    config_->allocator, config_->stream, max(config_->b_nnz, config_->a_nnz));
+  rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+                                          config_->stream);
 
   raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
                                     coo_rows.data(), config_->b_nnz,
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index 47b1ba6e41..6bb4b31bd5 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -168,7 +168,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  raft::mr::device::buffer<int> row_counts(d_alloc, stream, m + 1);
+  rmm::device_uvector<int> row_counts(m + 1, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 15302f3b74..0d6c55bb65 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -19,8 +19,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -36,15 +36,15 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
                   unsigned long long seed = 1234567) {
   auto stream = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
-  raft::mr::device::buffer<int> src_offsets(d_alloc, stream, n + 1);
-  raft::mr::device::buffer<int> dst_cols(d_alloc, stream, nnz);
-  raft::mr::device::buffer<T> dst_vals(d_alloc, stream, nnz);
+  rmm::device_uvector<int> src_offsets(n + 1, stream);
+  rmm::device_uvector<int> dst_cols(nnz, stream);
+  rmm::device_uvector<T> dst_vals(nnz, stream);
   convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
                       dst_cols.data(), dst_vals.data());
 
-  raft::mr::device::buffer<T> eigVals(d_alloc, stream, n_components + 1);
-  raft::mr::device::buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
-  raft::mr::device::buffer<int> labels(d_alloc, stream, n);
+  rmm::device_uvector<T> eigVals(n_components + 1, stream);
+  rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);
+  rmm::device_uvector<int> labels(n, stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 5c2c78f0c3..128ecb21bb 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -31,7 +31,6 @@
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
 #include
 
@@ -135,7 +134,7 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
 
   ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");
 
-  raft::mr::device::buffer<int> in_row_ind(d_alloc, stream, in->n_rows);
+  rmm::device_uvector<int> in_row_ind(in->n_rows, stream);
 
   convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream);
 
@@ -265,11 +264,11 @@ void from_knn_symmetrize_matrix(
                     raft::ceildiv(k, TPB_Y));
 
   // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4)
-  raft::mr::device::buffer<value_idx> row_sizes(d_alloc, stream, n);
+  rmm::device_uvector<value_idx> row_sizes(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
 
-  raft::mr::device::buffer<value_idx> row_sizes2(d_alloc, stream, n);
+  rmm::device_uvector<value_idx> row_sizes2(n, stream);
   CUDA_CHECK(
     cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
index 6afe4ca8f6..7979de5657 100644
--- a/cpp/include/raft/sparse/linalg/transpose.h
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -72,8 +72,8 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
     CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
     &convert_csc_workspace_size, stream));
 
-  raft::mr::device::buffer<char> convert_csc_workspace(
-    allocator, stream, convert_csc_workspace_size);
+  rmm::device_uvector<char> convert_csc_workspace(convert_csc_workspace_size,
+                                                  stream);
 
   CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
     handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 562d506cfe..5df7decf8b 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -87,8 +87,8 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals,
                        int nnz, int *cur_cnnz, T scalar, int n,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
                        cudaStream_t stream) {
-  raft::mr::device::buffer<int> ex_scan(d_alloc, stream, n);
-  raft::mr::device::buffer<int> cur_ex_scan(d_alloc, stream, n);
+  rmm::device_uvector<int> ex_scan(n, stream);
+  rmm::device_uvector<int> cur_ex_scan(n, stream);
 
   CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
   CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
@@ -129,8 +129,8 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
                       cudaStream_t stream) {
-  raft::mr::device::buffer<int> row_count_nz(d_alloc, stream, in->n_rows);
-  raft::mr::device::buffer<int> row_count(d_alloc, stream, in->n_rows);
+  rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<int> row_count(in->n_rows, stream);
 
   CUDA_CHECK(
     cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 09494e9eb1..a6834552e3 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -19,6 +19,8 @@
 #include
 #include
 
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include
@@ -243,13 +245,12 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  raft::mr::device::buffer<int64_t> trans(allocator, userStream,
-                                          id_ranges->size());
+  rmm::device_uvector<int64_t> trans(id_ranges->size(), userStream);
   raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
                       userStream);
 
-  raft::mr::device::buffer<float> all_D(allocator, userStream, 0);
-  raft::mr::device::buffer<int64_t> all_I(allocator, userStream, 0);
+  rmm::device_uvector<float> all_D(0, userStream);
+  rmm::device_uvector<int64_t> all_I(0, userStream);
 
   float *out_D = res_D;
   int64_t *out_I = res_I;
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index a645412c2f..d9292440d1 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -20,9 +20,9 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
 #include
 #include
+#include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace spatial {
@@ -57,7 +57,7 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
  size_t n_cols_;
  cudaStream_t stream_;
  std::shared_ptr<raft::mr::device::allocator> device_allocator_;
-  raft::mr::device::buffer<math_t> colsums_;
+  rmm::device_uvector<math_t> colsums_;
 
 public:
  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major,
                        cudaStream_t stream,
                        std::shared_ptr<raft::mr::device::allocator> allocator)
    : device_allocator_(allocator),
      stream_(stream),
-      colsums_(allocator, stream, n_rows),
+      colsums_(n_rows, stream),
      n_cols_(n_cols),
      n_rows_(n_rows),
      row_major_(row_major),
@@ -108,7 +108,7 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
                             std::shared_ptr<raft::mr::device::allocator> allocator)
    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream,
                                    allocator),
-      means_(allocator, stream, n_rows) {}
+      means_(n_rows, stream) {}
 
   void preprocess(math_t *data) {
     math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
@@ -143,7 +143,7 @@
 
   ~CorrelationMetricProcessor() = default;
 
-  raft::mr::device::buffer<math_t> means_;
+  rmm::device_uvector<math_t> means_;
};
 
 template
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 04f473f836..08429e18f2 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -24,6 +24,8 @@
 */
 
 #include
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include
@@ -65,15 +67,12 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
   for (int j = 0; j < problemcount; j++) {
     generateProblem(h_cost, batchsize, problemsize, costrange);
 
-    raft::mr::device::buffer<weight_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize * problemsize);
-    raft::mr::device::buffer<vertex_t> row_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
-    raft::mr::device::buffer<vertex_t> col_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
+    rmm::device_uvector<weight_t> elements_v(
+      batchsize * problemsize * problemsize, handle.get_stream());
+    rmm::device_uvector<vertex_t> row_assignment_v(batchsize * problemsize,
+                                                   handle.get_stream());
+    rmm::device_uvector<vertex_t> col_assignment_v(batchsize * problemsize,
+                                                   handle.get_stream());
 
     raft::update_device(elements_v.data(), h_cost,
                         batchsize * problemsize * problemsize,
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 00236d53fa..3c84d0db5f 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -20,7 +20,8 @@
 #include
 #include
 #include
-#include <raft/mr/device/buffer.hpp>
+#include <rmm/device_uvector.hpp>
+
 #include
 #include
 #include "../test_utils.h"
@@ -31,12 +32,11 @@ template <typename math_t>
 class CholeskyR1Test : public ::testing::Test {
 protected:
  CholeskyR1Test()
-    : allocator(handle.get_device_allocator()),
-      G(allocator, handle.get_stream(), n_rows * n_rows),
-      L(allocator, handle.get_stream(), n_rows * n_rows),
-      L_exp(allocator, handle.get_stream(), n_rows * n_rows),
-      devInfo(allocator, handle.get_stream(), 1),
-      workspace(allocator, handle.get_stream()) {
+    : G(n_rows * n_rows, handle.get_stream()),
+      L(n_rows * n_rows, handle.get_stream()),
+      L_exp(n_rows * n_rows, handle.get_stream()),
+      devInfo(1, handle.get_stream()),
+      workspace(0, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(G.data(), G_host, n_rows * n_rows, stream);
@@ -105,7 +105,6 @@
   }
 
   raft::handle_t handle;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
   cusolverDnHandle_t solver_handle;
   cudaStream_t stream;
 
@@ -120,11 +119,11 @@
 
   math_t G2_host[4] = {3, 4, 2, 1};
 
-  raft::mr::device::buffer<int> devInfo;
-  raft::mr::device::buffer<math_t> G;
-  raft::mr::device::buffer<math_t> L_exp;
-  raft::mr::device::buffer<math_t> L;
-  raft::mr::device::buffer<math_t> workspace;
+  rmm::device_uvector<int> devInfo;
+  rmm::device_uvector<math_t> G;
+  rmm::device_uvector<math_t> L_exp;
+  rmm::device_uvector<math_t> L;
+  rmm::device_uvector<math_t> workspace;
};
 
 typedef ::testing::Types FloatTypes;
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 6e146fa4bb..c78dd9e8fb 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 #include "../test_utils.h"
 
 namespace raft {
@@ -131,9 +132,7 @@ class MapGenericReduceTest : public ::testing::Test {
 protected:
  MapGenericReduceTest()
-    : allocator(handle.get_device_allocator()),
-      input(allocator, handle.get_stream(), n),
-      output(allocator, handle.get_stream(), 1) {
+    : input(n, handle.get_stream()), output(1, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     initInput(input.data(), input.size(), stream);
@@ -172,9 +171,8 @@
  int n = 1237;
  raft::handle_t handle;
  cudaStream_t stream;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
-  raft::mr::device::buffer<InType> input;
-  raft::mr::device::buffer<OutType> output;
+  rmm::device_uvector<InType> input;
+  rmm::device_uvector<OutType> output;
};
 
 using IoTypePair =
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 28222c0697..e7da92a136 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 #include "../test_utils.h"
 
 namespace raft {
@@ -102,10 +103,9 @@ class MatrixCopyRowsTest : public ::testing::Test {
 protected:
  MatrixCopyRowsTest()
-    : allocator(handle.get_device_allocator()),
-      input(allocator, handle.get_stream(), n_cols * n_rows),
-      indices(allocator, handle.get_stream(), n_selected),
-      output(allocator, handle.get_stream(), n_cols * n_selected) {
+    : input(n_cols * n_rows, handle.get_stream()),
+      indices(n_selected, handle.get_stream()),
+      output(n_cols * n_selected, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(indices.data(), indices_host, n_selected, stream);
@@ -143,10 +143,9 @@
                             14, 21, 22, 23, 27, 28, 29};
  raft::handle_t handle;
  cudaStream_t stream;
-  std::shared_ptr<raft::mr::device::allocator> allocator;
-  raft::mr::device::buffer<T> input;
-  raft::mr::device::buffer<T> output;
-  raft::mr::device::buffer<IndexT> indices;
+  rmm::device_uvector<T> input;
+  rmm::device_uvector<T> output;
+  rmm::device_uvector<IndexT> indices;
};
 
 using TypeTuple =
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 6e3f3b5038..3582075bfb 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <rmm/device_uvector.hpp>
 
 #include
 #include
@@ -67,9 +68,8 @@ class SparseDistanceCOOSPMVTest
 
   template <typename reduce_f, typename accum_f, typename write_f>
   void compute_dist(reduce_f reduce_func, accum_f accum_func,
write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.allocator, dist_config.stream, - max(dist_config.b_nnz, dist_config.a_nnz)); + rmm::device_uvector coo_rows( + max(dist_config.b_nnz, dist_config.a_nnz), dist_config.stream); raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, coo_rows.data(), dist_config.b_nnz, diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..64403eab7f 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -117,7 +117,7 @@ double compute_rand_index( ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); //allocating and initializing memory for a and b in the GPU - raft::mr::device::buffer arr_buf(allocator, stream, 2); + rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); //kernel configuration diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..e96e5c289c 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -17,11 +17,12 @@ #include #include #include -#include "../test_utils.h" - #include #include #include +#include + +#include "../test_utils.h" #include @@ -87,7 +88,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); - raft::mr::device::buffer coo_rows(alloc, stream, nnz); + rmm::device_uvector coo_rows(nnz, stream); raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); @@ -96,7 +97,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - raft::mr::device::buffer sum(alloc, stream, 1); + rmm::device_uvector sum(1, stream); CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); From 360bebb18959f026295600a91a7d2fa9bc520b5b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 30 Jun 2021 14:53:49 +0200 Subject: [PATCH 02/17] Remove references to RAFT allocator adapter --- cpp/include/raft/comms/helper.hpp | 10 ++-- cpp/include/raft/comms/std_comms.hpp | 40 ++++++---------- cpp/include/raft/label/classlabels.cuh | 33 +++++-------- .../raft/linalg/cholesky_r1_update.cuh | 8 ++-- cpp/include/raft/linalg/eig.cuh | 3 -- cpp/include/raft/matrix/math.cuh | 2 - cpp/include/raft/random/rng.cuh | 5 -- cpp/include/raft/sparse/convert/csr.cuh | 12 +---- cpp/include/raft/sparse/coo.cuh | 10 ++-- cpp/include/raft/sparse/csr.cuh | 11 +---- .../raft/sparse/distance/bin_distance.cuh | 10 ++-- cpp/include/raft/sparse/distance/common.h | 3 -- cpp/include/raft/sparse/distance/coo_spmv.cuh | 1 - cpp/include/raft/sparse/distance/csr_spmv.cuh | 12 ++--- cpp/include/raft/sparse/distance/distance.cuh | 1 - .../raft/sparse/distance/ip_distance.cuh | 3 +- .../raft/sparse/distance/l2_distance.cuh | 10 ++-- .../raft/sparse/distance/lp_distance.cuh | 1 - .../sparse/hierarchy/detail/agglomerative.cuh | 2 - .../hierarchy/detail/connectivities.cuh | 8 ++-- .../raft/sparse/hierarchy/detail/mst.cuh | 13 ++---- .../raft/sparse/hierarchy/single_linkage.hpp | 1 - cpp/include/raft/sparse/linalg/add.cuh | 3 -- cpp/include/raft/sparse/linalg/spectral.cuh | 2 - cpp/include/raft/sparse/linalg/symmetrize.cuh | 18 +++----- cpp/include/raft/sparse/linalg/transpose.h | 6 +-- cpp/include/raft/sparse/op/filter.cuh | 19 ++------ cpp/include/raft/sparse/op/reduce.cuh | 2 - cpp/include/raft/sparse/op/sort.h | 12 +---- 
.../sparse/selection/connect_components.cuh | 26 ++++------- cpp/include/raft/sparse/selection/knn.cuh | 18 ++------ .../raft/sparse/selection/knn_graph.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 3 -- .../knn/detail/ann_quantized_faiss.cuh | 6 +-- .../knn/detail/knn_brute_force_faiss.cuh | 9 ++-- .../raft/spatial/knn/detail/processing.hpp | 23 ++++------ cpp/include/raft/spatial/knn/knn.hpp | 6 +-- cpp/include/raft/spectral/matrix_wrappers.hpp | 46 ++++++------------- cpp/test/label/label.cu | 18 +++----- cpp/test/label/merge_labels.cu | 1 - cpp/test/linalg/binary_op.cu | 8 ++-- cpp/test/linalg/cholesky_r1.cu | 1 - cpp/test/linalg/map.cu | 1 - cpp/test/mr/device/buffer.cpp | 18 ++++---- cpp/test/sparse/add.cu | 5 +- cpp/test/sparse/connect_components.cu | 11 ++--- cpp/test/sparse/convert_csr.cu | 5 +- cpp/test/sparse/csr_row_slice.cu | 4 -- cpp/test/sparse/csr_to_dense.cu | 4 -- cpp/test/sparse/csr_transpose.cu | 6 +-- cpp/test/sparse/dist_coo_spmv.cu | 4 -- cpp/test/sparse/dist_csr_spmv.cu | 4 -- cpp/test/sparse/distance.cu | 4 -- cpp/test/sparse/filter.cu | 13 ++---- cpp/test/sparse/knn.cu | 7 +-- cpp/test/sparse/knn_graph.cu | 3 +- cpp/test/sparse/linkage.cu | 10 ++-- cpp/test/sparse/reduce.cu | 3 +- cpp/test/sparse/selection.cu | 2 - cpp/test/sparse/sort.cu | 5 +- cpp/test/sparse/symmetrize.cu | 12 ++--- cpp/test/spatial/haversine.cu | 3 -- 62 files changed, 161 insertions(+), 390 deletions(-) diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..e01490d728 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -38,11 +38,10 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream))); + new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -80,12 +79,11 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, } } - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..ff75931fb9 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -64,17 +64,16 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0), - device_allocator_(device_allocator) { + next_request_id_(0) { initialize(); }; @@ -85,27 +84,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ 
std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false), - device_allocator_(device_allocator) { + subcomms_ucp_(false) { initialize(); }; - virtual ~std_comms() { - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); - device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); - } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = status_.data(); + recvbuff_ = status_.data() + 1; } int get_size() const { return num_ranks_; } @@ -113,8 +104,8 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } std::unique_ptr comm_split(int color, int key) const { - mr::device::buffer d_colors(device_allocator_, stream_, get_size()); - mr::device::buffer d_keys(device_allocator_, stream_, get_size()); + rmm::device_uvector d_colors(get_size(), stream_); + rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); @@ -167,12 +158,12 @@ class std_comms : public comms_iface { if (ucp_worker_ != nullptr && subcomms_ucp_) { auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + return std::unique_ptr( + new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, + subcomm_ranks.size(), key, stream_, subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } @@ -465,6 +456,7 @@ class std_comms : public comms_iface { cudaStream_t stream_; int *sendbuff_, *recvbuff_; + rmm::device_uvector status_; int num_ranks_; int rank_; @@ -478,8 +470,6 @@ class std_comms : public comms_iface { mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; - - std::shared_ptr device_allocator_; }; } // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index f2b2463165..01a0afa774 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include namespace raft { @@ -40,12 +39,11 @@ namespace label { * on exit it has size [n_unique] - * \param [out] n_unique number of unique labels + * \return the number of unique labels * \param [in] stream cuda stream - * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, - cudaStream_t stream, - std::shared_ptr allocator) { +int getUniquelabels(rmm::device_uvector &y_unique, value_t *y, + size_t n, cudaStream_t stream) { + int n_unique; rmm::device_uvector y2(n, stream); rmm::device_uvector y3(n, stream); rmm::device_uvector d_num_selected(1, stream); @@ -64,13 +62,13 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(),
n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); + raft::update_host(&n_unique, d_num_selected.data(), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); - raft::copy(*y_unique, y3.data(), *n_unique, stream); + y_unique.resize(n_unique, stream); + raft::copy(y_unique.data(), y3.data(), n_unique, stream); + return n_unique; } /** @@ -147,22 +145,17 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, - std::shared_ptr allocator, - bool zero_based = false) { + Lambda filter_op, bool zero_based = false) { static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; - int num_clusters; - getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); + rmm::device_uvector map_ids(0, stream); + int num_clusters = getUniquelabels(map_ids, in, N, stream); map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); - - allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); + map_ids.data(), num_clusters, in, out, N, filter_op, zero_based); } /** @@ -184,11 +177,9 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - std::shared_ptr allocator, bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..d6d064c20e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -63,11 +63,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_L = n_rows; - * device_buffer L(allocator, stream, ld_L * n_rows); + * rmm::device_uvector L(ld_L * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * - * for (n=1; n<=n_rows; rank++) { + * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into A_new @@ -87,11 +87,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_U = n_rows; - * device_buffer U(allocator, stream, ld_U * n_rows); + * rmm::device_uvector U(ld_U * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into array A_new diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 8ab7011db4..83abd96756 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -44,7 +44,6 @@ template void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH =
handle.get_cusolver_dn_handle(); int lwork; @@ -93,7 +92,6 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -163,7 +161,6 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index f84b85d222..cd99bbfc84 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include namespace raft { @@ -285,7 +284,6 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param src: input matrix * @param dest: output matrix. The result is stored in the dest matrix * @param len: number elements of input matrix - * @param allocator device allocator * @param stream cuda stream */ template diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index a96c0bae38..3d2e44e49b 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -498,7 +497,6 @@ class Rng { * sampling is desired * @param sampledLen output sampled array length * @param len input array length - * @param allocator device allocator for allocating any workspace required * @param stream cuda stream */ template @@ -509,9 +507,6 @@ class Rng { ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - std::shared_ptr allocator = - handle.get_device_allocator(); - rmm::device_uvector expWts(len, stream); rmm::device_uvector sortedWts(len, stream); rmm::device_uvector inIdx(len, stream); diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 31a8f54721..16f351bf48 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,6 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, int *dst_offsets, int *dstCols, value_t *dstVals) { auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector dstRows(nnz, stream); CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); @@ -147,12 +145,10 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param nnz: size of COO rows array * @param row_ind: output row indices array * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, cudaStream_t stream) { rmm::device_uvector row_counts(m, stream); @@ -173,15 +169,11 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * * @param coo: Input COO matrix * @param row_ind: output row indices array - * @param d_alloc 
device allocator for temporary buffers * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); +void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 40d1a06720..6af8eae395 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -68,10 +67,9 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) + COO(cudaStream_t stream) : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), @@ -98,16 +96,14 @@ class COO { n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use * @param stream: CUDA stream to use * @param nnz: size of the rows/cols/vals arrays * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) + COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, + Index_Type n_cols = 0, bool init = true) : rows_arr(nnz, stream), cols_arr(nnz, stream), vals_arr(nnz, stream), diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index b30d7af8b4..a9ee3e63b2 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -208,7 +207,6 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) 
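As the next hunk shows, weak_cc now creates its own single-element state buffer through RMM and takes only a stream from the caller. A minimal calling sketch under the new signature; the graph identifiers (row_ind, row_ind_ptr, nnz, N) are hypothetical caller-side values, and the raft::sparse namespace is assumed from csr.cuh's location, not confirmed by this patch:

    #include <raft/sparse/csr.cuh>
    #include <rmm/device_uvector.hpp>

    // Hypothetical caller: label the connected components of one CSR
    // adjacency graph of N vertices. No allocator argument remains;
    // weak_cc's internal scratch comes from RMM's current device resource.
    void label_graph(const int *row_ind, const int *row_ind_ptr, int nnz,
                     int N, cudaStream_t stream) {
      rmm::device_uvector<int> labels(N, stream);
      raft::sparse::weak_cc(labels.data(), row_ind, row_ind_ptr, nnz, N,
                            stream);
    }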
@@ -216,9 +214,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, template bool> void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { + Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { rmm::device_uvector m(1, stream); WeakCCState state(m.data()); @@ -245,14 +241,11 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use */ template void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream) { + Index_ nnz, Index_ N, cudaStream_t stream) { rmm::device_uvector m(1, stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index 1ad5466aad..68c9548f7f 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -85,9 +84,8 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - cusparseHandle_t handle, - std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cusparseHandle_t handle, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); CUDA_CHECK( @@ -130,7 +128,7 @@ class jaccard_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t denom = q_r_union - dot; @@ -177,7 +175,7 @@ class dice_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t dice = (2 * dot) / q_r_union; diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 712d2c52bd..aabf5e47d4 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -17,7 +17,6 @@ #pragma once #include -#include namespace raft { namespace sparse { @@ -42,8 +41,6 @@ struct distances_config_t { value_t *b_data; cusparseHandle_t handle; - - std::shared_ptr allocator; cudaStream_t stream; }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index d596c6b852..edd1b78ae8 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -19,7 
+19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/csr_spmv.cuh b/cpp/include/raft/sparse/distance/csr_spmv.cuh index 2ee0abe47a..635e4568fd 100644 --- a/cpp/include/raft/sparse/distance/csr_spmv.cuh +++ b/cpp/include/raft/sparse/distance/csr_spmv.cuh @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -343,9 +342,8 @@ __global__ void max_kernel(value_idx *out, value_idx *in, value_idx n) { } template -inline value_idx max_degree( - value_idx *indptr, value_idx n_rows, - std::shared_ptr allocator, cudaStream_t stream) { +inline value_idx max_degree(value_idx *indptr, value_idx n_rows, + cudaStream_t stream) { rmm::device_uvector max_d(1, stream); CUDA_CHECK(cudaMemsetAsync(max_d.data(), 0, sizeof(value_idx), stream)); @@ -466,9 +464,9 @@ void generalized_csr_pairwise_semiring( int nnz_upper_bound = max_nnz_per_block(); // max_nnz set from max(diff(indptrA)) - value_idx max_nnz = max_degree(config_.a_indptr, config_.a_nrows, - config_.allocator, config_.stream) + - 1; + value_idx max_nnz = + max_degree(config_.a_indptr, config_.a_nrows, config_.stream) + + 1; if (max_nnz <= nnz_upper_bound) // use smem diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index 0cd0be11be..f22397614d 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 6297b009f4..cd48cd21a0 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -207,7 +206,7 @@ class ip_distances_gemm_t : public ip_trans_getters_t { raft::sparse::linalg::csr_transpose( config_->handle, config_->b_indptr, config_->b_indices, config_->b_data, csc_indptr.data(), csc_indices.data(), csc_data.data(), config_->b_nrows, - config_->b_ncols, config_->b_nnz, config_->allocator, config_->stream); + config_->b_ncols, config_->b_nnz, config_->stream); } value_t alpha; diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 3898189630..2ae7e365e4 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -89,9 +88,8 @@ void compute_l2(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - cusparseHandle_t handle, - std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cusparseHandle_t handle, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); CUDA_CHECK( @@ -134,7 +132,7 @@ class l2_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { return -2 * dot + q_norm + r_norm; }); @@ -201,7 +199,7 @@ class 
cosine_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle, config_->allocator, config_->stream, + config_->handle, config_->stream, [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t norms = sqrt(q_norm) * sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh index 4a7d5e53ff..31fed0656f 100644 --- a/cpp/include/raft/sparse/distance/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/lp_distance.cuh @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..187985627f 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -99,7 +99,6 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, size_t nnz, value_idx *children, value_t *out_delta, value_idx *out_size) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -224,7 +223,6 @@ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, const value_idx *children, size_t n_clusters, size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..b6ec190a98 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -60,12 +60,11 @@ struct distance_graph_impl &indptr, rmm::device_uvector &indices, rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); @@ -86,9 +85,8 @@ struct distance_graph_impl(tup)); }); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..033d5881d5 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -80,18 +80,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - raft::sparse::COO connected_edges(d_alloc, stream); + 
raft::sparse::COO connected_edges(stream); raft::linkage::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process @@ -136,7 +134,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, int max_iter = 10) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. @@ -145,7 +142,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, d_alloc, stream); + int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, @@ -153,7 +150,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, iters++; - n_components = linkage::get_n_components(color, m, d_alloc, stream); + n_components = linkage::get_n_components(color, m, stream); } /** diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..06fffb8aed 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -58,7 +58,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); rmm::device_uvector indices(EMPTY, stream); diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 6bb4b31bd5..3bf028d14a 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -156,14 +155,12 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) * @param out_ind: output row_ind array - * @param d_alloc: device allocator to use for temp memory * @param stream: cuda stream to use */ template size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, int *out_ind, - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 0d6c55bb65..28b9190c53 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -35,7 +34,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_components, T *out, unsigned long long seed = 1234567) { auto stream = 
handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector src_offsets(n + 1, stream); rmm::device_uvector dst_cols(nnz, stream); rmm::device_uvector dst_vals(nnz, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 128ecb21bb..5fcd336551 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -121,13 +120,11 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param in: Input COO matrix * @param out: Output symmetrized COO matrix * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -136,7 +133,7 @@ void coo_symmetrize(COO *in, COO *out, rmm::device_uvector in_row_ind(in->n_rows, stream); - convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + convert::sorted_coo_to_csr(in, in_row_ind.data(), stream); out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); @@ -249,14 +246,14 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param k: Number of n_neighbors * @param out: Output COO Matrix class * @param stream: Input cuda stream - * @param d_alloc device allocator for temporary buffers */ template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, + const value_t *restrict knn_dists, + const value_idx n, const int k, + COO *out, + cudaStream_t stream) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. 
const dim3 threadsPerBlock(TPB_X, TPB_Y); @@ -313,7 +310,6 @@ template void symmetrize(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t m, size_t n, size_t nnz, raft::sparse::COO &out) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -332,7 +328,7 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, // sort COO raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); + symm_vals.data(), stream); raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 7979de5657..7ad4b93ec0 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -53,7 +52,6 @@ namespace linalg { * @param[in] csr_nrows : Number of rows in CSR * @param[in] csr_ncols : Number of columns in CSR * @param[in] nnz : Number of nonzeros of CSR - * @param[in] allocator : Allocator for intermediate memory * @param[in] stream : Cuda stream for ordering events */ template @@ -61,9 +59,7 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, const value_idx *csr_indices, const value_t *csr_data, value_idx *csc_indptr, value_idx *csc_indices, value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, - std::shared_ptr allocator, - cudaStream_t stream) { + value_idx nnz, cudaStream_t stream) { size_t convert_csc_workspace_size = 0; CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 5df7decf8b..5383f6fe7e 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -84,9 +83,7 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, template void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { + int *cur_cnnz, T scalar, int n, cudaStream_t stream) { rmm::device_uvector ex_scan(n, stream); rmm::device_uvector cur_ex_scan(n, stream); @@ -122,13 +119,10 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param in: input COO matrix * @param out: output COO matrix * @param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { rmm::device_uvector row_count_nz(in->n_rows, stream); rmm::device_uvector row_count(in->n_rows, stream); @@ -154,7 +148,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, out->rows(), out->cols(), out->vals(), row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + in->n_rows, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -163,14 +157,11 @@ void coo_remove_scalar(COO *in, COO *out, T 
scalar, * * @param in: input COO matrix * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), stream); } }; // namespace op diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..2708f0491e 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -126,7 +125,6 @@ void max_duplicates(const raft::handle_t &handle, raft::sparse::COO &out, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..d53ceb62a9 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -62,13 +61,10 @@ struct TupleComp { * @param rows rows array from coo matrix * @param cols cols array from coo matrix * @param vals vals array from coo matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - // TODO: Remove this - std::shared_ptr d_alloc, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); @@ -81,16 +77,12 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @brief Sort the underlying COO arrays by row * @tparam T: the type name of the underlying value array * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, - // TODO: Remove this - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_sort(COO *const in, cudaStream_t stream) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + in->vals(), stream); } /** diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..9b02ae67e6 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -159,14 +159,10 @@ struct CubKVPMinReduce { */ template value_idx get_n_components(value_idx *colors, size_t n_rows, - std::shared_ptr d_alloc, cudaStream_t stream) { - value_idx *map_ids; - int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); - d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); - + rmm::device_uvector map_ids(0, stream); + int num_clusters = + raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -197,15 +193,13 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] d_alloc device allocator to use * @param[in] 
stream cuda stream for which to order cuda operations */ template void perform_1nn(cub::KeyValuePair *kvp, value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, - std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + size_t n_rows, size_t n_cols, cudaStream_t stream, + red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); @@ -324,7 +318,6 @@ void connect_components(const raft::handle_t &handle, size_t n_rows, size_t n_cols, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -336,10 +329,9 @@ void connect_components(const raft::handle_t &handle, // Normalize colors so they are drawn from a monotonically increasing set raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, stream); /** * First compute 1-nn for all colors where the color of each data point @@ -351,7 +343,7 @@ void connect_components(const raft::handle_t &handle, rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + n_rows, n_cols, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) @@ -380,7 +372,7 @@ void connect_components(const raft::handle_t &handle, size++; - raft::sparse::COO min_edges(d_alloc, stream); + raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); min_components_by_color(min_edges, out_index.data(), src_indices.data(), diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index e327386d13..d571eabb77 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -120,9 +119,7 @@ class sparse_knn_t { const value_idx *queryIndices_, const value_t *queryData_, size_t queryNNZ_, int n_query_rows_, int n_query_cols_, value_idx *output_indices_, value_t *output_dists_, int k_, - cusparseHandle_t cusparseHandle_, - std::shared_ptr allocator_, - cudaStream_t stream_, + cusparseHandle_t cusparseHandle_, cudaStream_t stream_, size_t batch_size_index_ = 2 << 14, // approx 1M size_t batch_size_query_ = 2 << 14, raft::distance::DistanceType metric_ = @@ -144,7 +141,6 @@ class sparse_knn_t { output_dists(output_dists_), k(k_), cusparseHandle(cusparseHandle_), - allocator(allocator_), stream(stream_), batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), @@ -369,7 +365,6 @@ class sparse_knn_t { dist_config.a_data = query_batch_data; dist_config.handle = cusparseHandle; - dist_config.allocator = allocator; dist_config.stream = stream; if (raft::sparse::distance::supportedDistance.find(metric) == @@ -395,8 +390,6 @@ class sparse_knn_t { cusparseHandle_t cusparseHandle; - std::shared_ptr allocator; - cudaStream_t stream; }; @@ -418,7 +411,6 @@ class sparse_knn_t { * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) * @param[in] k the number of neighbors to query * @param[in] cusparseHandle the initialized 
cusparseHandle instance to use - * @param[in] allocator device allocator instance to use * @param[in] stream CUDA stream to order operations with respect to * @param[in] batch_size_index maximum number of rows to use from index matrix per batch * @param[in] batch_size_query maximum number of rows to use from query matrix per batch @@ -432,9 +424,7 @@ void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, const value_idx *queryIndices, const value_t *queryData, size_t queryNNZ, int n_query_rows, int n_query_cols, value_idx *output_indices, value_t *output_dists, int k, - cusparseHandle_t cusparseHandle, - std::shared_ptr allocator, - cudaStream_t stream, + cusparseHandle_t cusparseHandle, cudaStream_t stream, size_t batch_size_index = 2 << 14, // approx 1M size_t batch_size_query = 2 << 14, raft::distance::DistanceType metric = @@ -443,8 +433,8 @@ void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, sparse_knn_t( idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, cusparseHandle, allocator, stream, - batch_size_index, batch_size_query, metric, metricArg) + output_indices, output_dists, k, cusparseHandle, stream, batch_size_index, + batch_size_query, metric, metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cf225087a..1cdd66f516 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -96,7 +96,6 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); size_t nnz = m * k; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..2cdf9bf4f5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,15 +22,12 @@ #include #include -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * @brief Flat C++ API function to build an approximate nearest neighbors index * from an index array and a set of parameters. 
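With the allocator parameter dropped, the sparse brute-force k-NN entry point is driven entirely by the cuSPARSE handle and stream. A minimal sketch of a call under the new signature; every CSR pointer and size below is a placeholder, and the raft::sparse::selection namespace is assumed from knn.cuh's location rather than stated by the patch:

    #include <raft/sparse/selection/knn.cuh>
    #include <rmm/device_uvector.hpp>

    // Hypothetical caller-side sketch: query k neighbors of each query row
    // against a CSR index. Temporary workspace is drawn from RMM's current
    // device resource instead of a passed-in allocator.
    void sparse_knn_example(const raft::handle_t &handle,
                            const int *idx_indptr, const int *idx_indices,
                            const float *idx_data, size_t idx_nnz,
                            int n_idx_rows, int n_idx_cols,
                            const int *query_indptr, const int *query_indices,
                            const float *query_data, size_t query_nnz,
                            int n_query_rows, int n_query_cols, int k) {
      cudaStream_t stream = handle.get_stream();
      rmm::device_uvector<int> out_inds(n_query_rows * k, stream);
      rmm::device_uvector<float> out_dists(n_query_rows * k, stream);
      raft::sparse::selection::brute_force_knn(
        idx_indptr, idx_indices, idx_data, idx_nnz, n_idx_rows, n_idx_cols,
        query_indptr, query_indices, query_data, query_nnz, n_query_rows,
        n_query_cols, out_inds.data(), out_dists.data(), k,
        handle.get_cusparse_handle(), stream);
    }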
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..c0345a01e6 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -46,7 +46,6 @@ #include #include -#include #include #include @@ -145,8 +144,7 @@ void approx_knn_build_index(raft::handle_t &handle, // perform preprocessing // k set to 0 (unused during preprocessing / revertion) std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + create_processor(metric, n, D, 0, false, handle.get_stream()); query_metric_processor->preprocess(index_array); @@ -183,7 +181,7 @@ void approx_knn_search(raft::handle_t &handle, float *distances, // perform preprocessing std::unique_ptr> query_metric_processor = create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index a6834552e3..6db8fb7a8e 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -181,7 +181,6 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[out] res_I pointer to device memory for returning k nearest indices * @param[out] res_D pointer to device memory for returning k nearest distances * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory * @param[in] userStream the main cuda stream to use * @param[in] internalStreams optional when n_params > 0, the index partitions can be * queried in parallel using these streams. 
Note that n_int_streams also @@ -200,7 +199,6 @@ template void brute_force_knn_impl(std::vector &input, std::vector &sizes, IntType D, float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, int n_int_streams = 0, bool rowMajorIndex = true, @@ -230,15 +228,14 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); std::vector>> metric_processors( input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor(metric, sizes[i], D, k, + rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index d9292440d1..876e91e877 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,6 @@ namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; /** * @brief A virtual class defining pre- and post-processing * for metrics. This class will temporarily modify its given @@ -56,15 +54,12 @@ class CosineMetricProcessor : public MetricProcessor { size_t n_rows_; size_t n_cols_; cudaStream_t stream_; - std::shared_ptr device_allocator_; rmm::device_uvector colsums_; public: CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream, - std::shared_ptr allocator) - : device_allocator_(allocator), - stream_(stream), + cudaStream_t stream) + : stream_(stream), colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), @@ -104,10 +99,8 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { public: CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, - std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), + bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream) {} void preprocess(math_t *data) { @@ -161,18 +154,18 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { + cudaStream_t userStream) { MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = + new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + userStream); break; default: mp = new DefaultMetricProcessor(); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..71c547c281 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ 
-18,15 +18,12 @@ #include "detail/knn_brute_force_faiss.cuh" -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - template inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, value_idx *outV, size_t n_samples, int n_parts, @@ -72,8 +69,7 @@ inline void brute_force_knn( std::vector int_streams = handle.get_internal_streams(); detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), + k, handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, translations, metric, metric_arg); } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..efa98313b6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -72,52 +73,31 @@ struct vector_view_t { : buffer_(buffer), size_(sz) {} vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { - other.buffer_ = nullptr; - other.size_ = 0; - } + : buffer_(other.raw()), size_(other.size()) {} vector_view_t& operator=(vector_view_t&& other) { - buffer_ = other.buffer_; - size_ = other.size_; - - other.buffer_ = nullptr; - other.size_ = 0; + buffer_ = other.raw(); + size_ = other.size(); + return *this; } }; -// allocatable vector, using raft handle allocator -// template class vector_t { - handle_t const& handle_; - value_type* buffer_; - size_type size_; - cudaStream_t stream_; - public: vector_t(handle_t const& raft_handle, size_type sz) - : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), - size_(sz), - stream_(raft_handle.get_stream()) {} - - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); - } + : buffer_(sz, raft_handle.get_stream()) {} - size_type size(void) const { return size_; } + size_type size(void) const { return buffer_.size(); } - value_type* raw(void) { return buffer_; } + value_type* raw(void) { return buffer_.data(); } - value_type const* raw(void) const { return buffer_; } + value_type const* raw(void) const { return buffer_.data(); } template value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + return thrust::reduce(t_exe_pol, buffer_.data(), + buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ?
right : -right; @@ -127,8 +106,11 @@ class vector_t { template void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_, size_, value); + thrust::fill_n(t_exe_pol, buffer_.data(), buffer_.size(), value); } + + private: + rmm::device_uvector buffer_; }; template diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..2a159994dc 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -20,7 +20,6 @@ #include #include -#include #include "../test_utils.h" #include @@ -57,9 +56,7 @@ TEST_F(MakeMonotonicTest, Result) { raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); - make_monotonic(actual, data, m, stream, allocator); + make_monotonic(actual, data, m, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -76,8 +73,6 @@ TEST_F(MakeMonotonicTest, Result) { TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); int n_rows = 6; float *y_d; @@ -86,20 +81,20 @@ TEST(labelTest, Classlabels) { float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); - int n_classes; - float *y_unique_d; - getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); + rmm::device_uvector y_unique_d(0, stream); + int n_classes = getUniquelabels(y_unique_d, y_d, n_rows, stream); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); float *y_relabeled_d; raft::allocate(y_relabeled_d, n_rows); - getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); + getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2, + stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, @@ -107,7 +102,6 @@ TEST(labelTest, Classlabels) { CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(y_d)); - CUDA_CHECK(cudaFree(y_unique_d)); CUDA_CHECK(cudaFree(y_relabeled_d)); } }; // namespace label diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..0648f165a6 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "../test_utils.h" diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..979b12f237 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include "../test_utils.h" #include "binary_op.cuh" @@ -136,9 +136,9 @@ class BinaryOpAlignment : public ::testing::Test { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. 
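The test hunks in this patch all apply the same mechanical substitution, and the BinaryOpAlignment hunk that continues below is typical: a raft::mr::device::buffer built from handle.get_device_allocator() and resized afterwards becomes an rmm::device_uvector sized at construction. A minimal before/after sketch, assuming only a valid cudaStream_t; the names are illustrative rather than taken from the diff, and the element type is spelled out explicitly because the original template arguments do not survive in this copy of the patch:

#include <rmm/device_uvector.hpp>

// Sketch of the allocation idiom this patch converges on; `stream` is
// assumed to be a valid, live CUDA stream.
void buffer_migration_sketch(cudaStream_t stream) {
  // before: raft::mr::device::buffer<float> x(handle.get_device_allocator(), stream);
  //         x.resize(1024, stream);
  // after: one stream-ordered container, sized at construction
  rmm::device_uvector<float> x(1024, stream);
  x.resize(2048, stream);  // stream-ordered reallocation, old contents kept
  x.release();             // gives up the allocation; size() becomes zero
}
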
int n = 1024; - mr::device::buffer x(handle.get_device_allocator(), stream, n); - mr::device::buffer y(handle.get_device_allocator(), stream, n); - mr::device::buffer z(handle.get_device_allocator(), stream, n); + rmm::device_uvector x(n, stream); + rmm::device_uvector y(n, stream); + rmm::device_uvector z(n, stream); CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 3c84d0db5f..2cb3bed754 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..089edd738e 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..fe42cea8b3 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -15,22 +15,21 @@ */ #include +#include #include #include -#include +#include #include -#include namespace raft { namespace mr { namespace device { TEST(Raft, DeviceBufferAlloc) { - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream); + rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); // explicit allocation after construction buff.resize(20, stream); @@ -39,12 +38,12 @@ TEST(Raft, DeviceBufferAlloc) { buff.resize(10, stream); ASSERT_EQ(10, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // use these methods without the explicit stream parameter - buff.resize(20); + buff.resize(20, stream); ASSERT_EQ(20, buff.size()); - buff.resize(10); + buff.resize(10, stream); ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); @@ -62,11 +61,10 @@ TEST(Raft, DeviceBufferZeroResize) { rmm::mr::set_current_device_resource(limit_mr.get()); - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream, 10); + rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); // explicit allocation after construction buff.resize(0, stream); @@ -75,7 +73,7 @@ TEST(Raft, DeviceBufferZeroResize) { buff.resize(20, stream); ASSERT_EQ(20, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // Now check that there is no memory left. 
(Used to not be true) diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..38c3e5bfbb 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -74,9 +74,6 @@ class CSRAddTest } void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); @@ -96,7 +93,7 @@ class CSRAddTest Index_ nnz = linalg::csr_add_calc_inds( ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + n_rows, ind_result, stream); ASSERT_TRUE(nnz == nnz_result); ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..dd6ba1479e 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -57,14 +56,12 @@ class ConnectComponentsTest : public ::testing::TestWithParam< void basicTest() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); params = ::testing::TestWithParam< ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_stream()); rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); @@ -77,7 +74,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< /** * 1. Construct knn graph */ - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph( handle, data.data(), params.n_row, params.n_col, @@ -85,7 +82,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + params.n_row + 1, stream); /** * 2. 
Construct MST, sorted by weights @@ -112,7 +109,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, - d_alloc, stream); + stream); auto output_mst = raft::mst::mst( handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..5bee5a000e 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -19,7 +19,6 @@ #include #include "../test_utils.h" -#include #include #include @@ -61,8 +60,6 @@ typedef SparseConvertCSRTest SortedCOOToCSR; TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); int nnz = 8; @@ -78,7 +75,7 @@ TEST_P(SortedCOOToCSR, Result) { raft::update_device(in, in_h, nnz, stream); raft::update_device(exp, exp_h, 4, stream); - convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + convert::sorted_coo_to_csr(in, nnz, out, 4, stream); ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..b158ffdedd 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -19,8 +19,6 @@ #include #include -#include -#include #include @@ -92,8 +90,6 @@ class CSRRowSliceTest void SetUp() override { params = ::testing::TestWithParam< CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..d04799befa 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -77,8 +75,6 @@ class CSRToDenseTest void SetUp() override { params = ::testing::TestWithParam< CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..5baeadd16f 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -20,8 +20,6 @@ #include #include -#include -#include #include @@ -94,8 +92,6 @@ class CSRTransposeTest void SetUp() override { params = ::testing::TestWithParam< CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -103,7 +99,7 @@ class CSRTransposeTest raft::sparse::linalg::csr_transpose( handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + params.nrows, params.ncols, params.nnz, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 3582075bfb..8b2e69e5a2 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -150,8 +149,6 @@ class SparseDistanceCOOSPMVTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); - std::shared_ptr 
alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -172,7 +169,6 @@ class SparseDistanceCOOSPMVTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/dist_csr_spmv.cu b/cpp/test/sparse/dist_csr_spmv.cu index c32748a04e..b087c3a612 100644 --- a/cpp/test/sparse/dist_csr_spmv.cu +++ b/cpp/test/sparse/dist_csr_spmv.cu @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -130,8 +129,6 @@ class SparseDistanceCSRSPMVTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceCSRSPMVInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -151,7 +148,6 @@ class SparseDistanceCSRSPMVTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 4247e374d6..729394fd2c 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include @@ -83,8 +82,6 @@ class SparseDistanceTest void SetUp() override { params = ::testing::TestWithParam< SparseDistanceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -106,7 +103,6 @@ class SparseDistanceTest dist_config.a_indices = indices; dist_config.a_data = data; dist_config.handle = cusparseHandle; - dist_config.allocator = alloc; dist_config.stream = stream; int out_size = dist_config.a_nrows * dist_config.b_nrows; diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..4634e5fc0e 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include #include @@ -53,13 +52,11 @@ typedef SparseFilterTests COORemoveZeros; TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); float *in_h_vals = new float[params.nnz]; - COO in(alloc, stream, params.nnz, 5, 5); + COO in(stream, params.nnz, 5, 5); raft::random::Rng r(params.seed); r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); @@ -82,7 +79,7 @@ TEST_P(COORemoveZeros, Result) { raft::update_device(in.cols(), in_h_cols, params.nnz, stream); raft::update_device(in.vals(), in_h_vals, params.nnz, stream); - op::coo_sort(&in, alloc, stream); + op::coo_sort(&in, stream); int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; @@ -91,14 +88,14 @@ TEST_P(COORemoveZeros, Result) { out_vals_ref_h[0] = in_h_vals[4]; out_vals_ref_h[1] = in_h_vals[1]; - COO out_ref(alloc, stream, 2, 5, 5); - COO out(alloc, stream); + COO out_ref(stream, 2, 5, 5); + COO out(stream); raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); - op::coo_remove_zeros<32, 
float>(&in, &out, alloc, stream); + op::coo_remove_zeros<32, float>(&in, &out, stream); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 4759eebe4b..e92eb53ada 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -24,8 +24,6 @@ #include #include -#include -#include namespace raft { namespace sparse { @@ -95,8 +93,6 @@ class SparseKNNTest void SetUp() override { params = ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&cusparseHandle)); @@ -110,8 +106,7 @@ class SparseKNNTest raft::sparse::selection::brute_force_knn( indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, nnz, n_rows, params.n_cols, out_indices, out_dists, k, cusparseHandle, - alloc, stream, params.batch_size_index, params.batch_size_query, - params.metric); + stream, params.batch_size_index, params.batch_size_query, params.metric); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..d4e9a915e5 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -64,10 +64,9 @@ class KNNGraphTest raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); - out = new raft::sparse::COO(alloc, stream); + out = new raft::sparse::COO(stream); allocate(X, params.X.size()); diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 64403eab7f..291880bf53 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -106,13 +105,11 @@ __global__ void computeTheNumerator(const T* firstClusterArray, * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr * @param stream: the cudaStream object */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { +double compute_rand_index(T* firstClusterArray, T* secondClusterArray, + uint64_t size, cudaStream_t stream) { //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); @@ -190,8 +187,7 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + compute_rand_index(labels, labels_ref, params.n_row, handle.get_stream()); } void SetUp() override { basicTest(); } diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..8ff4a600bc 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -53,7 +53,6 @@ class SparseReduceTest void Run() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); @@ -76,7 +75,7 @@ class SparseReduceTest raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); - raft::sparse::COO out(d_alloc, stream); + raft::sparse::COO out(stream); 
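The raft::sparse::COO construction just above shows the other recurring change: the container and the ops that fill it drop the explicit allocator and take the stream directly. A hedged sketch of the two construction paths these tests rely on; the element type is an assumption, since the original template arguments were stripped from this copy of the patch:

#include <raft/sparse/coo.cuh>

// Illustrative only: the allocator-free COO API after this patch.
void coo_sketch(cudaStream_t stream) {
  // empty container; the device arrays are allocated by whichever sparse
  // op fills it (symmetrize, max_duplicates, coo_remove_zeros, ...)
  raft::sparse::COO<float> out(stream);
  // pre-sized container: nnz = 8 entries of a 5 x 5 matrix
  raft::sparse::COO<float> in(stream, 8, 5, 5);
  // rows()/cols()/vals() still hand back raw device pointers for
  // raft::update_device(...) style initialization, as in the tests above
}
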
raft::sparse::op::max_duplicates(handle, out, in_rows.data(), in_cols.data(), in_vals.data(), params.in_rows.size(), params.m, params.n); diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..74aa9d6eaf 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -81,8 +81,6 @@ class SparseSelectionTest void SetUp() override { params = ::testing::TestWithParam< SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..9deac1b82f 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include @@ -55,8 +54,6 @@ TEST_P(COOSort, Result) { raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); raft::allocate(in_vals, params.nnz); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); @@ -80,7 +77,7 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols, in_cols_h, params.nnz, stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, stream); ASSERT_TRUE( diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index e96e5c289c..a060b8128a 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -79,7 +79,6 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); make_data(); @@ -92,7 +91,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); - raft::sparse::COO out(alloc, stream); + raft::sparse::COO out(stream); raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); @@ -149,9 +148,6 @@ TEST_P(COOSymmetrize, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - int nnz = 8; int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; @@ -165,19 +161,19 @@ TEST_P(COOSymmetrize, Result) { float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; - COO in(alloc, stream, nnz, 4, 4); + COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); raft::update_device(in.cols(), *&in_cols_h, nnz, stream); raft::update_device(in.vals(), *&in_vals_h, nnz, stream); - COO out(alloc, stream); + COO out(stream); linalg::coo_symmetrize<32, float>( &in, &out, [] __device__(int row, int col, float val, float trans) { return val + trans; }, - alloc, stream); + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..e62bec4381 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -30,8 +29,6 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - auto alloc = std::make_shared(); - // Allocate input 
raft::allocate(d_train_inputs, n * d); From 565cf68404670ec65a5db26237dff6d03447315e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 1 Jul 2021 11:52:45 +0200 Subject: [PATCH 03/17] RAFT handle update --- cpp/include/raft/handle.hpp | 52 +++++++++++++++++++---------------- python/raft/common/handle.pxd | 2 -- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..1b1923abb7 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include "cudart_utils.h" namespace raft { @@ -48,6 +47,9 @@ namespace raft { * necessary cuda kernels and/or libraries */ class handle_t { + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + private: static constexpr int kNumDefaultWorkerStreams = 0; @@ -63,9 +65,7 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - streams_(n_streams), - device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + streams_(n_streams) { create_resources(); } @@ -86,8 +86,6 @@ class handle_t { "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); } @@ -97,26 +95,15 @@ class handle_t { int get_device() const { return dev_id_; } - void set_stream(cudaStream_t stream) { user_stream_ = stream; } + void set_stream(cudaStream_t stream) { + thrust_policy_initialized_ = false; + user_stream_ = stream; + } cudaStream_t get_stream() const { return user_stream_; } rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { - device_allocator_ = allocator; - } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } - - void set_host_allocator(std::shared_ptr allocator) { - host_allocator_ = allocator; - } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } - cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { @@ -153,6 +140,23 @@ class handle_t { return cusparse_handle_; } + thrust_exec_policy_t get_thrust_policy() const { + std::lock_guard _(mutex_); + if (!thrust_policy_initialized_) { + if (!thrust_policy_) { + thrust_policy_ = + (thrust_exec_policy_t*)malloc(sizeof(thrust_exec_policy_t)); + } + *thrust_policy_ = rmm::exec_policy(this->get_stream()); + thrust_policy_initialized_ = true; + } + return *thrust_policy_; + } + + thrust_exec_policy_t get_thrust_policy(cudaStream_t stream) const { + return rmm::exec_policy(stream); + } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); @@ -236,8 +240,8 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - std::shared_ptr device_allocator_; - std::shared_ptr host_allocator_; + mutable thrust_exec_policy_t* thrust_policy_{nullptr}; + mutable bool thrust_policy_initialized_{false}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; diff --git a/python/raft/common/handle.pxd 
b/python/raft/common/handle.pxd index 6076640312..884d81bed1 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -34,7 +34,5 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t() except + handle_t(int ns) except + void set_stream(_Stream s) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + _Stream get_stream() except + int get_num_internal_streams() except + From eb1253ad0e4924a81910d07624eada21624eb52c Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 1 Jul 2021 11:53:51 +0200 Subject: [PATCH 04/17] Use raft::handle_t::get_thrust_policy to create thrust policy --- cpp/include/raft/sparse/distance/coo_spmv.cuh | 2 - .../coo_spmv_strategies/base_strategy.cuh | 1 - .../coo_mask_row_iterators.cuh | 1 - .../coo_spmv_strategies/hash_strategy.cuh | 2 +- .../raft/sparse/distance/l2_distance.cuh | 5 +- .../sparse/hierarchy/detail/agglomerative.cuh | 3 +- .../hierarchy/detail/connectivities.cuh | 5 +- .../raft/sparse/hierarchy/detail/mst.cuh | 3 - cpp/include/raft/sparse/linalg/spectral.cuh | 5 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 1 - .../raft/sparse/mst/detail/mst_solver_inl.cuh | 12 +- cpp/include/raft/sparse/op/reduce.cuh | 8 +- .../sparse/selection/connect_components.cuh | 3 +- cpp/include/raft/spectral/cluster_solvers.hpp | 9 +- cpp/include/raft/spectral/kmeans.hpp | 129 ++++++++---------- cpp/include/raft/spectral/matrix_wrappers.hpp | 43 +++--- .../raft/spectral/modularity_maximization.hpp | 28 ++-- cpp/include/raft/spectral/partition.hpp | 28 ++-- cpp/include/raft/spectral/spectral_util.hpp | 22 ++- cpp/test/cluster_solvers.cu | 9 +- cpp/test/eigen_solvers.cu | 7 +- cpp/test/spectral_matrix.cu | 19 ++- 22 files changed, 140 insertions(+), 205 deletions(-) diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 4f979aa6f1..24be171900 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -34,8 +34,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..194799aed0 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -23,7 +23,6 @@ #include #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..74eb37bc2b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -20,7 +20,6 @@ #include "../utils.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..a95c6ff85b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -55,7 +55,7 @@ class hash_strategy : public coo_spmv_strategy { rmm::device_uvector &mask_indptr, std::tuple &n_rows_divided, 
cudaStream_t stream) { - auto policy = rmm::exec_policy(stream); + auto policy = this->config.handle.get_thrust_policy(); auto less = thrust::copy_if( policy, thrust::make_counting_iterator(value_idx(0)), diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 5f89101082..534280191b 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -245,9 +245,8 @@ class hellinger_expanded_distances_t : public distances_t { : config_(&config), workspace(0, config.handle.get_stream()) {} void compute(value_t *out_dists) { - raft::mr::device::buffer coo_rows( - config_->handle.get_device_allocator(), config_->handle.get_stream(), - max(config_->b_nnz, config_->a_nnz)); + rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), + config_->handle.get_stream()); raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, coo_rows.data(), config_->b_nnz, diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 187985627f..2a7a8b3e4e 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -224,7 +223,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, const value_idx *children, size_t n_clusters, size_t n_leaves) { auto stream = handle.get_stream(); - auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Handle special case where n_clusters == 1 if (n_clusters == 1) { diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index b6ec190a98..31e4a0f263 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -61,7 +60,7 @@ struct distance_graph_impl &indices, rmm::device_uvector &data, int c) { auto stream = handle.get_stream(); - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(stream); @@ -77,7 +76,7 @@ struct distance_graph_impl &tup) { bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 033d5881d5..6ef6f9879b 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -25,12 +25,9 @@ #include #include -#include - #include #include #include -#include namespace raft { namespace hierarchy { diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 28b9190c53..d15c2cdf23 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -63,7 +63,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); + auto t_exe_p = handle.get_thrust_policy(); using 
thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, @@ -83,8 +83,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using value_type_t = value_type; std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const *__restrict__ obs, index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5fcd336551..614c9d830e 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..029b76a945 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -38,7 +37,6 @@ #include #include -#include namespace raft { namespace mst { @@ -88,7 +86,7 @@ MST_solver::MST_solver( sm_count = handle_.get_device_properties().multiProcessorCount; //Initially, color holds the vertex id as color - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); thrust::sequence(policy, color_index, color_index + v, 0); @@ -227,7 +225,7 @@ template alteration_t MST_solver::alteration_max() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); rmm::device_vector tmp(e); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); @@ -327,7 +325,7 @@ template void MST_solver::min_edge_per_vertex() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), @@ -354,7 +352,7 @@ void MST_solver::max()); @@ -411,7 +409,7 @@ template void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); auto curr_mst_edge_count = prev_mst_edge_count[0]; diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 2708f0491e..09a35720fb 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -126,16 +125,15 @@ void max_duplicates(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { auto stream = handle.get_stream(); - - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan rmm::device_uvector diff(nnz + 1, stream); compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + 
thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), + diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 9b02ae67e6..390522c9bc 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -30,7 +30,6 @@ #include #include #include -#include #include @@ -361,7 +360,7 @@ void connect_components(const raft::handle_t &handle, raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), out_index.data() + out_index.size(), out_index.data()); // compute final size diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..6f507331d9 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -42,19 +42,16 @@ struct kmeans_solver_t { size_type_t> const& config) : config_(config) {} - template std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed); + kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, + config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..5928c727c6 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -325,7 +325,6 @@ static __global__ void divideCentroids( * Centroid is randomly chosen with k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -341,12 +340,9 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. 
*/ -template -static int chooseNewCentroid(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, - value_type_t rand, +template +static int chooseNewCentroid(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -357,8 +353,9 @@ static int chooseNewCentroid(handle_t const& handle, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -417,10 +414,7 @@ static int chooseNewCentroid(handle_t const& handle, * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -439,14 +433,12 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ -template +template static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { + handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -458,8 +450,9 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -486,8 +479,8 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -499,8 +492,8 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + 
IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -529,10 +522,7 @@ static int initializeCentroids( * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -553,16 +543,18 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto cublas_h = handle.get_cublas_handle(); +template +static int assignCentroids(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -606,10 +598,7 @@ static int assignCentroids( * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -628,10 +617,8 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, +template +static int updateCentroids(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, @@ -649,8 +636,9 @@ static int updateCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory thrust::device_ptr obs_copy(work); @@ -722,10 +710,7 @@ namespace raft { * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. 
- * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -754,11 +739,10 @@ namespace raft { * @param seed random seed to be used. * @return error flag. */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, @@ -785,16 +769,17 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Initialization // ------------------------------------------------------- - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -840,21 +825,21 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, + work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids @@ -868,12 +853,11 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) 
+ if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); emptyCentroid = (thrust::find(thrust_exec_policy, @@ -905,10 +889,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -926,11 +907,10 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, value_type_t& residual, index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; @@ -950,9 +930,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Perform k-means return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), + centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index efa98313b6..42fc621a1a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -85,7 +85,8 @@ template class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) - : buffer_(sz, raft_handle.get_stream()) {} + : buffer_(sz, raft_handle.get_stream()), + thrust_policy(raft_handle.get_thrust_policy()) {} size_type size(void) const { return buffer_.size(); } @@ -93,9 +94,8 @@ class vector_t { value_type const* raw(void) const { return buffer_.data(); } - template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_.data(), + value_type nrm1() const { + return thrust::reduce(thrust_policy, buffer_.data(), buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? 
left : -left; @@ -104,13 +104,15 @@ class vector_t { }); } - template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_.data(), buffer_.size(), value); + void fill(value_type value) { + thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } private: + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; rmm::device_uvector buffer_; + const thrust_exec_policy_t thrust_policy; }; template @@ -262,31 +264,26 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - template - laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, - index_type const* row_offsets, + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - template laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, csr_m.col_indices_, csr_m.values_, csr_m.nrows_, csr_m.nnz_), diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } @@ -333,27 +330,19 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, row_offsets, col_indices, values, nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..1fe7819a7e 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -79,19 +79,18 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -101,7 +100,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -111,7 +110,7 @@ std::tuple modularity_maximization( eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // notice that at this point the matrix has already been transposed, so we are scaling // columns @@ -119,8 +118,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -138,9 +137,8 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, @@ -163,15 +161,15 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; // Initialize output modularity = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, + clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } @@ -180,7 +178,7 @@ void analyzeModularity(handle_t const &handle, modularity += partModularity; } - modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); + modularity = modularity / B.diagonal_.nrm1(); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..a994895886 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,19 +62,18 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
*/ -template +template std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -89,7 +88,7 @@ std::tuple partition( // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -99,11 +98,11 @@ std::tuple partition( eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -129,9 +128,8 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -140,8 +138,8 @@ void analyzePartition(handle_t const &handle, vertex_t i; vertex_t n = csr_m.nrows_; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -155,7 +153,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; // Initialize output cost = 0; @@ -164,8 +162,8 @@ void analyzePartition(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, + part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..de9ff1917f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -108,13 +108,12 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { return cudaSuccess; } -template -void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { - auto cublas_h = handle.get_cublas_handle(); +template +void 
transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, + weight_t* eigVecs) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -187,16 +186,15 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template -bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, +template +bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, + weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..d280b3e95c 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -49,8 +49,7 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } TEST(Raft, ModularitySolvers) { @@ -89,14 +88,12 @@ TEST(Raft, ModularitySolvers) { auto stream = h.get_stream(); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..8025d8dcd6 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -102,16 +102,15 @@ TEST(Raft, SpectralSolvers) { auto stream = h.get_stream(); - auto t_exe_p = thrust::cuda::par.on(stream); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, + clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..b85d35e3f8 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -57,27 +57,24 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm2.row_offsets_); auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); - auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr 
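The same substitution repeats through the spectral headers: the raft handle now supplies the Thrust policy alongside the stream and the cuBLAS handle, so the ThrustExePolicy template parameter disappears from every signature. A minimal sketch of a caller under the new convention (function name and arguments are illustrative):

#include <raft/handle.hpp>
#include <thrust/sequence.h>

void iota_device(const raft::handle_t& handle, int* out, int n) {
  // handle.get_thrust_policy() is an RMM-backed policy bound to
  // handle.get_stream(), replacing an explicitly threaded policy object.
  thrust::sequence(handle.get_thrust_policy(), out, out + n, 0);
}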
args - auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { - laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + auto cnstr_lm2 = [&h, &sm2](void) { + laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args - auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { - modularity_matrix_t mm2{h, t_exe_pol, sm2}; + auto cnstr_mm2 = [&h, &sm2](void) { + modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } From 0bc3d2b5aa314b26f7b6f1dea5313789e064703b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Fri, 2 Jul 2021 11:49:55 +0200 Subject: [PATCH 05/17] Use of rmm::device_scalar --- cpp/include/raft/comms/test.hpp | 12 ++++++------ cpp/include/raft/label/classlabels.cuh | 5 +++-- cpp/include/raft/lap/lap.cuh | 6 +++--- cpp/include/raft/lap/lap_functions.cuh | 3 ++- cpp/include/raft/linalg/eig.cuh | 13 ++++++------- cpp/include/raft/linalg/qr.cuh | 5 +++-- cpp/include/raft/linalg/svd.cuh | 5 +++-- cpp/include/raft/matrix/math.cuh | 3 ++- cpp/include/raft/sparse/csr.cuh | 5 +++-- cpp/test/label/merge_labels.cu | 5 +++-- cpp/test/linalg/cholesky_r1.cu | 5 +++-- cpp/test/linalg/map_then_reduce.cu | 5 +++-- cpp/test/sparse/knn_graph.cu | 8 ++++---- cpp/test/sparse/symmetrize.cu | 8 ++++---- 14 files changed, 48 insertions(+), 40 deletions(-) diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 9f5edc1425..17db8e88af 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -43,7 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK( cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); @@ -74,7 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -101,7 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -130,7 +130,7 @@ bool test_collective_allgather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d(communicator.get_size(), stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -162,7 +162,7 @@ bool test_collective_gather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - rmm::device_uvector temp_d(1, stream); + rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d( communicator.get_rank() == root ? 
communicator.get_size() : 0, stream); @@ -244,7 +244,7 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); rmm::device_uvector temp_d(sends.size(), stream); - rmm::device_uvector recv_d(1, stream); + rmm::device_scalar recv_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index f2b2463165..b7878dc276 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -48,7 +49,7 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, std::shared_ptr allocator) { rmm::device_uvector y2(n, stream); rmm::device_uvector y3(n, stream); - rmm::device_uvector d_num_selected(1, stream); + rmm::device_scalar d_num_selected(stream); size_t bytes = 0; size_t bytes2 = 0; @@ -64,7 +65,7 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); + *n_unique = d_num_selected.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 29c1cf6aa1..f64afb3549 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -233,16 +233,16 @@ class LinearAssignmentProblem { int hungarianStep3() { int next; - rmm::device_uvector flag_v(1, handle_.get_stream()); + rmm::device_scalar flag_v(handle_.get_stream()); bool h_flag = false; - raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); + flag_v.set_value_async(h_flag, handle_.get_stream()); detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); - raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); + h_flag = flag_v.value(handle_.get_stream()); next = h_flag ? 
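The hungarianStep3 hunk above contains the complete device_scalar round trip that replaces paired update_device/update_host calls. Distilled into a standalone sketch (the kernel launch is elided and the function name is illustrative):

#include <rmm/device_scalar.hpp>

bool read_device_flag(cudaStream_t stream) {
  rmm::device_scalar<bool> flag(stream);  // device storage, uninitialized
  bool init = false;
  flag.set_value_async(init, stream);     // stream-ordered H2D copy
  // ... launch work that may write true through flag.data() ...
  return flag.value(stream);              // D2H copy; synchronizes stream
}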
4 : 5; diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index ce3f67f8fe..1e12e3917c 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -35,6 +35,7 @@ #include #include #include +#include #include namespace raft { @@ -408,7 +409,7 @@ inline void dualUpdate(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks; - rmm::device_uvector sp_min_v(1, handle.get_stream()); + rmm::device_scalar sp_min_v(handle.get_stream()); raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 8ab7011db4..951e2b8e7a 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -53,7 +54,7 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, n_cols, eig_vals, &lwork)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector d_dev_info(1, stream); + rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); @@ -63,8 +64,7 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + int dev_info = d_dev_info.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " @@ -105,7 +105,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector d_dev_info(1, stream); + rmm::device_scalar d_dev_info(stream); rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { @@ -127,8 +127,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); + int dev_info = d_dev_info.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" @@ -177,7 +176,7 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); rmm::device_uvector d_work(lwork, stream); - rmm::device_uvector dev_info(1, stream); + rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index 14771f289d..cc912d7d86 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace raft { @@ -52,7 +53,7 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); int Lwork; CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); @@ -97,7 +98,7 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, cudaMemcpyDeviceToDevice, stream)); int Lwork; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, R_full_ncols, R_full.data(), diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index a1507cfa9b..8b40a80903 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "eig.cuh" #include "gemm.cuh" @@ -69,7 +70,7 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int m = n_rows; const int n = n_cols; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); T *d_rwork = nullptr; int lwork = 0; @@ -170,7 +171,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int m = n_rows; int n = n_cols; - rmm::device_uvector devInfo(1, stream); + rmm::device_scalar devInfo(stream); int lwork = 0; int econ = 1; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index f84b85d222..51e83b854b 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace raft { @@ -294,7 +295,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, auto d_src = src; auto d_dest = dest; - rmm::device_uvector d_sum(1, stream); + rmm::device_scalar d_sum(stream); auto *d_sum_ptr = d_sum.data(); auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index b30d7af8b4..d571a4999a 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -219,7 +220,7 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, Index_ nnz, Index_ N, std::shared_ptr d_alloc, cudaStream_t stream, Lambda filter_op) { - rmm::device_uvector m(1, stream); + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, @@ -253,7 +254,7 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, Index_ nnz, Index_ N, std::shared_ptr d_alloc, cudaStream_t stream) { - rmm::device_uvector m(1, stream); + rmm::device_scalar 
m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..f58daae01b 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -50,7 +51,7 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} + m(stream) {} void Run() { raft::update_device(labels_a.data(), params.labels_a.data(), params.N, @@ -76,7 +77,7 @@ class MergeLabelsTest raft::handle_t handle; cudaStream_t stream; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_uvector mask, m; + rmm::device_scalar mask, m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 3c84d0db5f..3265ef9e13 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ class CholeskyR1Test : public ::testing::Test { : G(n_rows * n_rows, handle.get_stream()), L(n_rows * n_rows, handle.get_stream()), L_exp(n_rows * n_rows, handle.get_stream()), - devInfo(1, handle.get_stream()), + devInfo(handle.get_stream()), workspace(0, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); @@ -119,7 +120,7 @@ class CholeskyR1Test : public ::testing::Test { math_t G2_host[4] = {3, 4, 2, 1}; - rmm::device_uvector devInfo; + rmm::device_scalar devInfo; rmm::device_uvector G; rmm::device_uvector L_exp; rmm::device_uvector L; diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index c78dd9e8fb..2a8b356f3f 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -132,7 +133,7 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() - : input(n, handle.get_stream()), output(1, handle.get_stream()) { + : input(n, handle.get_stream()), output(handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -172,7 +173,7 @@ class MapGenericReduceTest : public ::testing::Test { raft::handle_t handle; cudaStream_t stream; rmm::device_uvector input; - rmm::device_uvector output; + rmm::device_scalar output; }; using IoTypePair = diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..e8391c3ea4 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -77,9 +78,8 @@ class KNNGraphTest handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); /** * Assert the knn graph is symmetric @@ -87,7 +87,7 @@ class KNNGraphTest assert_symmetry<<nnz, 256), 256, 0, stream>>>( out->rows(), out->cols(), out->vals(), out->nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git 
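The knn_graph hunk above shows the accumulator idiom this patch standardizes on: a device_scalar zeroed asynchronously, accumulated into by a kernel, then read back once. As a generic sketch (the reduction kernel is assumed, not shown):

#include <rmm/device_scalar.hpp>

template <typename T>
T reduce_into_scalar(cudaStream_t stream) {
  rmm::device_scalar<T> sum(stream);
  sum.set_value_to_zero_async(stream);  // replaces cudaMemsetAsync on a
                                        // one-element device_uvector
  // ... launch a kernel that accumulates atomically into sum.data() ...
  return sum.value(stream);
}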
a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index e96e5c289c..b1745c9b26 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -97,14 +98,13 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); assert_symmetry<<>>( out.rows(), out.cols(), out.vals(), out.nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } From f6fe37a9718adc91a472f47aa357cc438930c69b Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 5 Jul 2021 12:04:35 +0200 Subject: [PATCH 06/17] Use rmm::exec_policy instead of thrust::cuda::par.on + remove rmm::device_vector --- cpp/include/raft/linalg/init.h | 3 +- cpp/include/raft/linalg/transpose.h | 4 +- cpp/include/raft/matrix/matrix.cuh | 14 +-- cpp/include/raft/sparse/convert/csr.cuh | 2 +- cpp/include/raft/sparse/coo.cuh | 1 - .../coo_spmv_strategies/base_strategy.cuh | 1 - cpp/include/raft/sparse/distance/utils.cuh | 2 - cpp/include/raft/sparse/linalg/add.cuh | 3 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 5 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 110 +++++++++--------- cpp/include/raft/sparse/mst/detail/utils.cuh | 4 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 24 ++-- cpp/include/raft/sparse/op/sort.h | 9 +- .../sparse/selection/connect_components.cuh | 9 +- .../knn/detail/ann_quantized_faiss.cuh | 1 - .../knn/detail/knn_brute_force_faiss.cuh | 1 - cpp/include/raft/spectral/kmeans.hpp | 1 - .../raft/spectral/modularity_maximization.hpp | 1 - cpp/include/raft/spectral/partition.hpp | 1 - cpp/include/raft/spectral/spectral_util.hpp | 1 - cpp/test/eigen_solvers.cu | 1 - cpp/test/linalg/reduce.cuh | 15 ++- cpp/test/matrix/matrix.cu | 2 +- cpp/test/mst.cu | 20 ++-- 24 files changed, 118 insertions(+), 117 deletions(-) diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..9944685a1f 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -40,7 +41,7 @@ void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); - thrust::copy(thrust::cuda::par.on(stream), first, last, ptr); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); } /** diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..db1cabd694 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include namespace raft { namespace linalg { @@ -60,7 +60,7 @@ void transpose(math_t *inout, int n, cudaStream_t stream) { auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { int s_row = idx % m; int s_col = idx / m; diff --git a/cpp/include/raft/matrix/matrix.cuh 
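The policy swap in init.h and transpose.h above is mechanical, but it is worth spelling out what changes at runtime. A before/after sketch (function and data are illustrative; the device lambda assumes nvcc's extended-lambda flag, which this codebase already uses):

#include <rmm/exec_policy.hpp>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

void scale_inplace(float* data, int n, float alpha, cudaStream_t stream) {
  auto counting = thrust::make_counting_iterator<int>(0);
  // Before: thrust::cuda::par.on(stream) ran on `stream` but allocated any
  // temporary storage with cudaMalloc. After: rmm::exec_policy(stream)
  // runs on the same stream and draws temporaries (for algorithms that
  // need them) from the current RMM device resource.
  thrust::for_each(rmm::exec_policy(stream), counting, counting + n,
                   [=] __device__(int i) { data[i] *= alpha; });
}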
b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..688b92da09 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include namespace raft { namespace matrix { @@ -64,7 +64,7 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % n_rows_indices; idx_t col = idx / n_rows_indices; @@ -108,7 +108,7 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, auto d_q_trunc = out; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % m; idx_t col = idx / m; @@ -133,8 +133,8 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = dest_row; @@ -161,8 +161,8 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = (m - dest_row) - 1; diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 16f351bf48..79b18ebd0a 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -160,7 +160,7 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 6af8eae395..fa21614f8f 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 194799aed0..3b57225350 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -22,7 +22,6 @@ #include "coo_mask_row_iterators.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..3bee1bc87d 100644 --- 
a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -21,8 +21,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 3bf028d14a..7ed627b9e2 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -181,7 +182,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 614c9d830e..a6e1027288 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -293,8 +294,8 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, + __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 029b76a945..33b980afcd 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -23,10 +23,10 @@ #include "utils.cuh" #include -#include +#include +#include #include -#include #include #include #include @@ -34,10 +34,6 @@ #include #include -#include - -#include - namespace raft { namespace mst { typedef std::chrono::high_resolution_clock Clock; @@ -63,20 +59,20 @@ MST_solver::MST_solver( offsets(offsets_), indices(indices_), weights(weights_), - altered_weights(e_), + altered_weights(e_, stream_), v(v_), e(e_), color_index(color_), - color(v_), - next_color(v_), - min_edge_color(v_), - new_mst_edge(v_), - mst_edge(e_, false), - temp_src(2 * v_), - temp_dst(2 * v_), - temp_weights(2 * v_), - mst_edge_count(1, 0), - prev_mst_edge_count(1, 0), + color(v_, stream_), + next_color(v_, stream_), + min_edge_color(v_, stream_), + new_mst_edge(v_, stream_), + mst_edge(e_, stream_), + temp_src(2 * v_, stream_), + temp_dst(2 * v_, stream_), + temp_weights(2 * v_, stream_), + mst_edge_count(1, stream_), + prev_mst_edge_count(1, stream_), stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), @@ -85,13 +81,18 @@ MST_solver::MST_solver( max_threads = handle_.get_device_properties().maxThreadsPerBlock; sm_count = handle_.get_device_properties().multiProcessorCount; + mst_edge_count.set_value_to_zero_async(stream); + prev_mst_edge_count.set_value_to_zero_async(stream); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), + stream)); + //Initially, color holds the vertex id as color auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 
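A behavioral note behind these constructor changes: rmm::device_vector value-initialized its elements, while rmm::device_uvector deliberately leaves memory uninitialized, which is why explicit memsets and set_value_to_zero_async calls appear above. A sketch of the required pattern (names are illustrative; CUDA_CHECK is the raft macro used throughout the patch):

#include <raft/cudart_utils.h>
#include <rmm/device_uvector.hpp>

rmm::device_uvector<bool> make_flags(std::size_t n, cudaStream_t stream) {
  rmm::device_uvector<bool> flags(n, stream);  // contents undefined here
  CUDA_CHECK(
    cudaMemsetAsync(flags.data(), 0, flags.size() * sizeof(bool), stream));
  return flags;
}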
0); thrust::sequence(policy, color_index, color_index + v, 0); } else { - raft::copy(color.data().get(), color_index, v, stream); + raft::copy(color.data(), color_index, v, stream); } thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } @@ -158,12 +159,12 @@ MST_solver::solve() { timer3 += duration_us(stop - start); #endif - auto curr_mst_edge_count = mst_edge_count[0]; + auto curr_mst_edge_count = mst_edge_count.value(stream); RAFT_EXPECTS(curr_mst_edge_count <= max_mst_edges, "Number of edges found by MST is invalid. This may be due to " "loss in precision. Try increasing precision of weights."); - if (curr_mst_edge_count == prev_mst_edge_count[0]) { + if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 @@ -194,12 +195,11 @@ MST_solver::solve() { #endif // copy this iteration's results and store - prev_mst_edge_count = mst_edge_count; + prev_mst_edge_count.set_value_async(curr_mst_edge_count, stream); } // result packaging - thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = mst_edge_count.value(stream); mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -226,7 +226,7 @@ template ::alteration_max() { auto policy = handle.get_thrust_policy(); - rmm::device_vector tmp(e); + rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); //sort tmp weights @@ -240,7 +240,7 @@ MST_solver::alteration_max() { thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; + auto init = tmp.element(1, stream) - tmp.element(0, stream); auto max = thrust::transform_reduce(policy, begin, end, alteration_functor(), init, thrust::minimum()); @@ -259,7 +259,7 @@ void MST_solver::alteration() { alteration_t max = alteration_max(); // pool of rand values - rmm::device_vector rand_values(v); + rmm::device_uvector rand_values(v, stream); // Random number generator curandGenerator_t randGen; @@ -267,8 +267,7 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, @@ -276,8 +275,8 @@ void MST_solver::alteration() { //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data(), + altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher @@ -286,23 +285,24 @@ template ::label_prop( vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors - thrust::host_vector curr_mst_edge_count = mst_edge_count; + edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto 
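alteration_max above also trades device_vector's host-side operator[] for device_uvector::element(i, stream), which makes the device-to-host copy and its synchronization explicit. A sketch (assuming element() synchronizes the given stream before returning, as in this RMM version):

#include <rmm/device_uvector.hpp>

// Difference of the first two elements of a sorted device array, read on
// the host via two explicit, stream-ordered single-element copies.
template <typename T>
T first_gap(const rmm::device_uvector<T>& sorted, cudaStream_t stream) {
  return sorted.element(1, stream) - sorted.element(0, stream);
}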
min_pair_nthreads = std::min(v, (vertex_t)max_threads); auto min_pair_nblocks = std::min( (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); - rmm::device_vector done(1, false); - - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); - vertex_t* next_color_ptr = next_color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + vertex_t* color_ptr = color.data(); + vertex_t* next_color_ptr = next_color.data(); - bool* done_ptr = done.data().get(); + rmm::device_scalar done(stream); + done.set_value_to_zero_async(stream); + bool* done_ptr = done.data(); + const bool true_val = true; auto i = 0; - while (!done[0]) { - done[0] = true; + while (!done.value(stream)) { + done.set_value_async(true_val, stream); detail::min_pair_colors<<>>( v, indices, new_mst_edge_ptr, color_ptr, color_index, next_color_ptr); @@ -333,11 +333,11 @@ void MST_solver>>( offsets, indices, altered_weights_ptr, color_ptr, color_index, @@ -356,14 +356,14 @@ void MST_solver::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); - alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); + alteration_t* altered_weights_ptr = altered_weights.data(); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); detail::min_edge_per_supervertex<<>>( color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, @@ -388,8 +388,8 @@ void MST_solver::check_termination() { std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges - edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + edge_t* mst_edge_count_ptr = mst_edge_count.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -411,7 +411,7 @@ void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { auto policy = handle.get_thrust_policy(); - auto curr_mst_edge_count = prev_mst_edge_count[0]; + edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..4d5ca6ebe1 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #define MST_TIME namespace raft { @@ -32,7 +32,7 @@ __device__ idx_t get_1D_idx() { // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh 
b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..44b34ee5c7 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -18,8 +18,8 @@ #pragma once #include +#include #include -#include namespace raft { @@ -68,24 +68,24 @@ class MST_solver { vertex_t sm_count; vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector + rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_vector + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_vector + rmm::device_scalar prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration - rmm::device_vector temp_src; - rmm::device_vector temp_dst; - rmm::device_vector temp_weights; + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; void label_prop(vertex_t* mst_src, vertex_t* mst_dst); void min_edge_per_vertex(); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index d53ceb62a9..9414a11ade 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -69,8 +69,8 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. 
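These member-type changes ripple into every constructor: device_uvector and device_scalar have no default or size-only constructors, so each member must receive the stream in the constructor's initializer list, as the mst_solver_inl.cuh hunk earlier does. A condensed sketch with hypothetical names:

#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

template <typename vertex_t>
class solver_state {
 public:
  solver_state(std::size_t v, cudaStream_t stream)
    : color_(v, stream), edge_count_(stream) {}  // stream required up front

 private:
  rmm::device_uvector<vertex_t> color_;
  rmm::device_scalar<vertex_t> edge_count_;
};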
- thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, + vals, TupleComp()); } /** @@ -104,8 +104,7 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(rmm::exec_policy(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 390522c9bc..46369ca964 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -211,7 +212,7 @@ void perform_1nn(cub::KeyValuePair *kvp, workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } @@ -232,15 +233,15 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors, cub::KeyValuePair *kvp, value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, + src_indices); auto keys = thrust::make_zip_iterator(thrust::make_tuple( colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. 
- thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index c0345a01e6..43bdf12a38 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -39,7 +39,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 6db8fb7a8e..84c130b0e4 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -29,7 +29,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 5928c727c6..b6f0105487 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 1fe7819a7e..fededbfcb4 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index a994895886..2df3812a4a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index de9ff1917f..c148350c0f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 8025d8dcd6..ef67d95348 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -38,7 +38,6 @@ TEST(Raft, EigenSolvers) { index_type nnz = 0; index_type nrows = 0; auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 18261287cf..86cb4f32b4 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -16,9 +16,9 @@ #include #include -#include #include #include +#include namespace raft { namespace linalg { @@ -51,17 +51,20 @@ void unaryAndGemv(Type *dots, const Type *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, [] __device__(Type v) { return v * v; }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + rmm::device_uvector ones(N, stream); //column vector [1...1] + raft::linalg::unaryOp( + ones.data(), ones.data(), ones.size(), + [=] 
__device__(Type input) { return 1; }, stream); Type alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, + sq.data(), D, ones.data(), 1, &beta, + dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index e7da92a136..2d2d9d2057 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -112,7 +112,7 @@ class MatrixCopyRowsTest : public ::testing::Test { // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..4714fd5eaa 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -127,11 +127,18 @@ class MSTTest v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector color(v, 0); + rmm::device_uvector mst_src(2 * v - 2, handle.get_stream()); + rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); + rmm::device_uvector color(v, handle.get_stream()); + + CUDA_CHECK( + cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + handle.get_stream())); vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); @@ -214,7 +221,6 @@ class MSTTest protected: MSTTestInput mst_input; CSRDevice csr_d; - rmm::device_vector mst_edge; vertex_t v; edge_t e; int iterations; From 1b5dbfdebe81b508ad52ecdfbebb0ff7afd4648e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 7 Jul 2021 14:12:20 +0200 Subject: [PATCH 07/17] Update raft::allocate to use RMM --- cpp/include/raft/cudart_utils.h | 17 ++++++++--- cpp/test/distance/dist_adj.cu | 10 +++---- cpp/test/distance/distance_base.cuh | 12 ++++---- cpp/test/distance/fused_l2_nn.cu | 18 ++++++----- cpp/test/label/label.cu | 10 +++---- cpp/test/linalg/add.cu | 8 ++--- cpp/test/linalg/binary_op.cu | 8 ++--- cpp/test/linalg/coalesced_reduction.cu | 6 ++-- cpp/test/linalg/divide.cu | 6 ++-- cpp/test/linalg/eig.cu | 24 +++++++-------- cpp/test/linalg/eig_sel.cu | 10 +++---- cpp/test/linalg/eltwise.cu | 14 ++++----- cpp/test/linalg/map.cu | 12 ++++---- cpp/test/linalg/map_then_reduce.cu | 6 ++-- cpp/test/linalg/matrix_vector_op.cu | 10 +++---- cpp/test/linalg/multiply.cu | 6 ++-- cpp/test/linalg/norm.cu | 12 ++++---- cpp/test/linalg/reduce.cu | 6 ++-- cpp/test/linalg/strided_reduction.cu | 6 ++-- cpp/test/linalg/subtract.cu | 8 ++--- cpp/test/linalg/svd.cu | 14 ++++----- cpp/test/linalg/transpose.cu | 6 ++-- cpp/test/linalg/unary_op.cu | 6 ++-- cpp/test/matrix/math.cu | 30 +++++++++---------- cpp/test/matrix/matrix.cu | 8 ++--- cpp/test/random/rng.cu | 20 ++++++------- 
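The cudart_utils.h hunk that follows reroutes raft::allocate through rmm::mr::get_current_device_resource() and adds a mandatory stream argument; the new raft::deallocate releases through the same resource. A usage sketch under the new signatures (buffer name and size are illustrative):

#include <raft/cudart_utils.h>

void workspace_roundtrip(cudaStream_t stream) {
  float* ws;
  raft::allocate(ws, 256, stream, /*setZero=*/true);
  // ... use ws in kernels ordered on `stream` ...
  raft::deallocate(ws, 256, stream);  // same length and stream to release
}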
cpp/test/random/rng_int.cu | 4 +-- cpp/test/random/sample_without_replacement.cu | 8 ++--- cpp/test/sparse/add.cu | 24 +++++++-------- cpp/test/sparse/convert_coo.cu | 6 ++-- cpp/test/sparse/convert_csr.cu | 14 ++++----- cpp/test/sparse/csr_row_slice.cu | 18 +++++------ cpp/test/sparse/csr_to_dense.cu | 10 +++---- cpp/test/sparse/csr_transpose.cu | 18 +++++------ cpp/test/sparse/degree.cu | 17 ++++++----- cpp/test/sparse/dist_coo_spmv.cu | 11 +++---- cpp/test/sparse/distance.cu | 11 +++---- cpp/test/sparse/knn.cu | 15 +++++----- cpp/test/sparse/knn_graph.cu | 2 +- cpp/test/sparse/linkage.cu | 4 +-- cpp/test/sparse/norm.cu | 8 ++--- cpp/test/sparse/row_op.cu | 6 ++-- cpp/test/sparse/selection.cu | 12 ++++---- cpp/test/sparse/sort.cu | 8 ++--- cpp/test/sparse/symmetrize.cu | 6 ++-- cpp/test/spatial/haversine.cu | 16 +++++----- cpp/test/spatial/knn.cu | 28 ++++++++--------- cpp/test/stats/mean.cu | 4 +-- cpp/test/stats/mean_center.cu | 8 ++--- cpp/test/stats/stddev.cu | 8 ++--- cpp/test/stats/sum.cu | 4 +-- 51 files changed, 290 insertions(+), 273 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..8b8b3cbeca 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -259,11 +260,19 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -/** cuda malloc */ template -void allocate(Type*& ptr, size_t len, bool setZero = false) { - CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); - if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +void allocate(Type*& ptr, size_t len, cudaStream_t stream, + bool setZero = false) { + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate( + len * sizeof(Type), stream); + if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, len * sizeof(Type))); +} + +template +void deallocate(Type*& ptr, size_t len, cudaStream_t stream, + bool setZero = false) { + rmm::mr::get_current_device_resource()->deallocate(ptr, len * sizeof(Type), + stream); } /** helper method to get max usable shared mem per block parameter */ diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..dc397a5eb2 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -79,10 +79,10 @@ class DistanceAdjTest bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); @@ -94,7 +94,7 @@ class DistanceAdjTest raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index d6f06c186a..ac12076e3a 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -169,11 +169,11 @@ class DistanceTest : public ::testing::TestWithParam> { bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - 
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index e2ed2c01dc..dc397a5eb2 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -79,10 +79,10 @@ class DistanceAdjTest
   bool isRowMajor = params.isRowMajor;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(dist_ref, m * n);
-  raft::allocate(dist, m * n);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(dist_ref, m * n, stream);
+  raft::allocate(dist, m * n, stream);
   r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream);
   r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
@@ -94,7 +94,7 @@ class DistanceAdjTest
     raft::distance::getWorkspaceSize(x, y, m, n, k);
   if (worksize != 0) {
-    raft::allocate(workspace, worksize);
+    raft::allocate(workspace, worksize, stream);
   }
   auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) {
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index d6f06c186a..ac12076e3a 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -169,11 +169,11 @@ class DistanceTest : public ::testing::TestWithParam> {
   bool isRowMajor = params.isRowMajor;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(dist_ref, m * n);
-  raft::allocate(dist, m * n);
-  raft::allocate(dist2, m * n);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(dist_ref, m * n, stream);
+  raft::allocate(dist, m * n, stream);
+  raft::allocate(dist2, m * n, stream);
   r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream);
   r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
   naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor);
@@ -182,7 +182,7 @@ class DistanceTest : public ::testing::TestWithParam> {
     raft::distance::getWorkspaceSize(x, y, m, n, k);
   if (worksize != 0) {
-    raft::allocate(workspace, worksize);
+    raft::allocate(workspace, worksize, stream);
   }
   DataType threshold = -10000.f;
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 4573a070b6..8f12b26dc0 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -107,13 +107,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> {
   int n = params.n;
   int k = params.k;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(x, m * k);
-  raft::allocate(y, n * k);
-  raft::allocate(xn, m);
-  raft::allocate(yn, n);
-  raft::allocate(workspace, sizeof(int) * m);
-  raft::allocate(min, m);
-  raft::allocate(min_ref, m);
+  raft::allocate(x, m * k, stream);
+  raft::allocate(y, n * k, stream);
+  raft::allocate(xn, m, stream);
+  raft::allocate(yn, n, stream);
+  raft::allocate(workspace, sizeof(int) * m, stream);
+  raft::allocate(min, m, stream);
+  raft::allocate(min_ref, m, stream);
   r.uniform(x, m * k, DataT(-1.0), DataT(1.0), stream);
   r.uniform(y, n * k, DataT(-1.0), DataT(1.0), stream);
   generateGoldenResult();
@@ -282,7 +282,8 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   void SetUp() override {
     FusedL2NNTest::SetUp();
     int m = this->params.m;
-    raft::allocate(min1, m);
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    raft::allocate(min1, m, stream);
   }
   void TearDown() override {
@@ -294,6 +295,7 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   cub::KeyValuePair *min1;
   static const int NumRepeats = 100;
+  cudaStream_t stream;
   void generateGoldenResult() override {}
 };
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 2a159994dc..86a8927283 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -43,9 +43,9 @@ TEST_F(MakeMonotonicTest, Result) {
   float *data, *actual, *expected;
-  raft::allocate(data, m, true);
-  raft::allocate(actual, m, true);
-  raft::allocate(expected, m, true);
+  raft::allocate(data, m, stream, true);
+  raft::allocate(actual, m, stream, true);
+  raft::allocate(expected, m, stream, true);
   float *data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0,  3.0,
                                8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
@@ -76,7 +76,7 @@ TEST(labelTest, Classlabels) {
   int n_rows = 6;
   float *y_d;
-  raft::allocate(y_d, n_rows);
+  raft::allocate(y_d, n_rows, stream);
   float y_h[] = {2, -1, 1, 2, 1, 1};
   raft::update_device(y_d, y_h, n_rows, stream);
@@ -91,7 +91,7 @@ TEST(labelTest, Classlabels) {
                             raft::Compare(), stream));
   float *y_relabeled_d;
-  raft::allocate(y_relabeled_d, n_rows);
+  raft::allocate(y_relabeled_d, n_rows, stream);
   getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2,
                stream);
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index 2fc9d4e30f..ab7de0f24b 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -32,10 +32,10 @@ class AddTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InT(-1.0), InT(1.0), stream);
   r.uniform(in2, len, InT(-1.0), InT(1.0), stream);
   naiveAddElem(out_ref, in1, in2, len);
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index 979b12f237..475d8e58ff 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -48,10 +48,10 @@ class BinaryOpTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   naiveAdd(out_ref, in1, in2, len);
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index e45f5651b4..bfbaf9b5f9 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -59,9 +59,9 @@ class coalescedReductionTest
   int len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, rows);
-  raft::allocate(dots_act, rows);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, rows, stream);
+  raft::allocate(dots_act, rows, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveCoalescedReduction(dots_exp, data, cols, rows, stream);
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 2396558939..07fbda7e1c 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -54,9 +54,9 @@ class DivideTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveDivide(out_ref, in, params.scalar, len, stream);
   divideScalar(out, in, params.scalar, len, stream);
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index 159d288174..d53713f004 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -50,24 +50,24 @@ class EigTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
-  raft::allocate(cov_matrix, len);
+  raft::allocate(cov_matrix, len, stream);
   T cov_matrix_h[] = {1.0,  0.9,  0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
                       0.81, 0.9,  1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
   ASSERT(len == 16, "This test only works with 4x4 matrices!");
   raft::update_device(cov_matrix, cov_matrix_h, len, stream);
-  raft::allocate(eig_vectors, len);
-  raft::allocate(eig_vals, params.n_col);
-  raft::allocate(eig_vectors_jacobi, len);
-  raft::allocate(eig_vals_jacobi, params.n_col);
+  raft::allocate(eig_vectors, len, stream);
+  raft::allocate(eig_vals, params.n_col, stream);
+  raft::allocate(eig_vectors_jacobi, len, stream);
+  raft::allocate(eig_vals_jacobi, params.n_col, stream);
   T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874,
                            0.4874, -0.5123, 0.6498, 0.2789,  -0.2789, -0.6498,
                            0.4874, 0.5123,  0.5123, 0.4874};
   T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266};
-  raft::allocate(eig_vectors_ref, len);
-  raft::allocate(eig_vals_ref, params.n_col);
+  raft::allocate(eig_vectors_ref, len, stream);
+  raft::allocate(eig_vals_ref, params.n_col, stream);
   raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream);
   raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream);
@@ -82,11 +82,11 @@ class EigTest : public ::testing::TestWithParam> {
   // test code for comparing two methods
   len = params.n * params.n;
-  raft::allocate(cov_matrix_large, len);
-  raft::allocate(eig_vectors_large, len);
-  raft::allocate(eig_vectors_jacobi_large, len);
-  raft::allocate(eig_vals_large, params.n);
-  raft::allocate(eig_vals_jacobi_large, params.n);
+  raft::allocate(cov_matrix_large, len, stream);
+  raft::allocate(eig_vectors_large, len, stream);
+  raft::allocate(eig_vectors_jacobi_large, len, stream);
+  raft::allocate(eig_vals_large, params.n, stream);
+  raft::allocate(eig_vals_jacobi_large, params.n, stream);
   r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream);
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index b3980f281d..26c8177269 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -51,21 +51,21 @@ class EigSelTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   int len = params.len;
-  raft::allocate(cov_matrix, len);
+  raft::allocate(cov_matrix, len, stream);
   T cov_matrix_h[] = {1.0,  0.9,  0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
                       0.81, 0.9,  1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
   ASSERT(len == 16, "This test only works with 4x4 matrices!");
   raft::update_device(cov_matrix, cov_matrix_h, len, stream);
-  raft::allocate(eig_vectors, 12);
-  raft::allocate(eig_vals, params.n_col);
+  raft::allocate(eig_vectors, 12, stream);
+  raft::allocate(eig_vals, params.n_col, stream);
   T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874,  -0.5123, 0.6498, 0.2789,
                            -0.2789, -0.6498, 0.4874, 0.5123,  0.5123, 0.4874};
   T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266};
-  raft::allocate(eig_vectors_ref, 12);
-  raft::allocate(eig_vals_ref, params.n_col);
+  raft::allocate(eig_vectors_ref, 12, stream);
+  raft::allocate(eig_vals_ref, params.n_col, stream);
   raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream);
   raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream);
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index 572951c557..e955f7a354 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -69,9 +69,9 @@ class ScalarMultiplyTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, scalar, len, stream);
   scalarMultiply(out, in, scalar, len, stream);
@@ -156,10 +156,10 @@ class EltwiseAddTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   int len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   r.uniform(in2, len, T(-1.0), T(1.0), stream);
   naiveAdd(out_ref, in1, in2, len, stream);
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 089edd738e..6087893006 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -47,7 +47,7 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
                 const InType *in3, InType scalar, IdxType len,
                 cudaStream_t stream) {
   InType *tmp;
-  allocate(tmp, len);
+  raft::allocate(tmp, len, stream);
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
   scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream);
@@ -66,11 +66,11 @@ class MapTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
-  allocate(in1, len);
-  allocate(in2, len);
-  allocate(in3, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(in3, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in3, len, InType(-1.0), InType(1.0), stream);
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index c78dd9e8fb..49b7deaf1f 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -77,9 +77,9 @@ class MapReduceTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   mapReduceLaunch(out_ref, out, in, len, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index aa46c78b0f..933f806ff5 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -68,12 +68,12 @@ class MatVecOpTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   IdxType vecLen = params.bcastAlongRows ? D : N;
-  allocate(vec1, vecLen);
-  allocate(vec2, vecLen);
+  raft::allocate(vec1, vecLen, stream);
+  raft::allocate(vec2, vecLen, stream);
   r.uniform(in, len, (T)-1.0, (T)1.0, stream);
   r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream);
   r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream);
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 1d3e753de3..33f2b1c104 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -34,9 +34,9 @@ class MultiplyTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, params.scalar, len, stream);
   multiplyScalar(out, in, params.scalar, len, stream);
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index acc25addd0..29ea0a1915 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -78,9 +78,9 @@ class RowNormTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols, len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, rows);
-  raft::allocate(dots_act, rows);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, rows, stream);
+  raft::allocate(dots_act, rows, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
                stream);
@@ -143,10 +143,10 @@ class ColNormTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols, len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
-  raft::allocate(dots_exp, cols);
-  raft::allocate(dots_act, cols);
+  raft::allocate(dots_exp, cols, stream);
+  raft::allocate(dots_act, cols, stream);
   naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
                stream);
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 255cf1a696..dba8f76a22 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -58,9 +58,9 @@ class ReduceTest : public ::testing::TestWithParam> {
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
   outlen = params.alongRows ? rows : cols;
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, outlen);
-  raft::allocate(dots_act, outlen);
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, outlen, stream);
+  raft::allocate(dots_act, outlen, stream);
   r.uniform(data, len, T(-1.0), T(1.0), stream);
   naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
                  params.alongRows, stream);
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index b27fa2ac1a..3fe076f1d4 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -49,9 +49,9 @@ class stridedReductionTest
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
-  raft::allocate(data, len);
-  raft::allocate(dots_exp, cols);  //expected dot products (from test)
-  raft::allocate(dots_act, cols);  //actual dot products (from prim)
+  raft::allocate(data, len, stream);
+  raft::allocate(dots_exp, cols, stream);  //expected dot products (from test)
+  raft::allocate(dots_act, cols, stream);  //actual dot products (from prim)
   r.uniform(data, len, T(-1.0), T(1.0), stream);  //initialize matrix to random
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index ced3f65fdd..7318410330 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -81,10 +81,10 @@ class SubtractTest : public ::testing::TestWithParam> {
   int len = params.len;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(out, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   r.uniform(in2, len, T(-1.0), T(1.0), stream);
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index fff321768f..adfbfd01d6 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -49,7 +49,7 @@ class SvdTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   int len = params.len;
   cudaStream_t stream = handle.get_stream();
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   ASSERT(params.n_row == 3, "This test only supports nrows=3!");
   ASSERT(params.len == 6, "This test only supports len=6!");
@@ -59,9 +59,9 @@ class SvdTest : public ::testing::TestWithParam> {
   int left_evl = params.n_row * params.n_col;
   int right_evl = params.n_col * params.n_col;
-  raft::allocate(left_eig_vectors_qr, left_evl);
-  raft::allocate(right_eig_vectors_trans_qr, right_evl);
-  raft::allocate(sing_vals_qr, params.n_col);
+  raft::allocate(left_eig_vectors_qr, left_evl, stream);
+  raft::allocate(right_eig_vectors_trans_qr, right_evl, stream);
+  raft::allocate(sing_vals_qr, params.n_col, stream);
   // allocate(left_eig_vectors_jacobi, left_evl);
   // allocate(right_eig_vectors_trans_jacobi, right_evl);
@@ -74,9 +74,9 @@ class SvdTest : public ::testing::TestWithParam> {
   T sing_vals_ref_h[] = {7.065283, 1.040081};
-  raft::allocate(left_eig_vectors_ref, left_evl);
-  raft::allocate(right_eig_vectors_ref, right_evl);
-  raft::allocate(sing_vals_ref, params.n_col);
+  raft::allocate(left_eig_vectors_ref, left_evl, stream);
+  raft::allocate(right_eig_vectors_ref, right_evl, stream);
+  raft::allocate(sing_vals_ref, params.n_col, stream);
   raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl,
                       stream);
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index f10b029962..08be179c59 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -48,16 +48,16 @@ class TransposeTest : public ::testing::TestWithParam> {
   int len = params.len;
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   ASSERT(params.len == 9, "This test works only with len=9!");
   T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
   raft::update_device(data, data_h, len, stream);
-  raft::allocate(data_trans_ref, len);
+  raft::allocate(data_trans_ref, len, stream);
   T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0};
   raft::update_device(data_trans_ref, data_ref_h, len, stream);
-  raft::allocate(data_trans, len);
+  raft::allocate(data_trans, len, stream);
   transpose(handle, data, data_trans, params.n_row, params.n_col, stream);
   transpose(data, params.n_row, stream);
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 666ab8619d..8ef810d794 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -53,9 +53,9 @@ class UnaryOpTest
   raft::random::Rng r(params.seed);
   CUDA_CHECK(cudaStreamCreate(&stream));
   auto len = params.len;
-  allocate(in, len);
-  allocate(out_ref, len);
-  allocate(out, len);
+  raft::allocate(in, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
 }
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 578139623a..227aca643e 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -115,22 +115,22 @@ class MathTest : public ::testing::TestWithParam> {
   random::Rng r(params.seed);
   int len = params.len;
-  allocate(in_power, len);
-  allocate(out_power_ref, len);
-  allocate(in_sqrt, len);
-  allocate(out_sqrt_ref, len);
-  allocate(in_sign_flip, len);
-  allocate(out_sign_flip_ref, len);
-
   raft::handle_t handle;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(in_ratio, 4);
+  raft::allocate(in_power, len, stream);
+  raft::allocate(out_power_ref, len, stream);
+  raft::allocate(in_sqrt, len, stream);
+  raft::allocate(out_sqrt_ref, len, stream);
+  raft::allocate(in_sign_flip, len, stream);
+  raft::allocate(out_sign_flip_ref, len, stream);
+
+  raft::allocate(in_ratio, 4, stream);
   T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0};
   update_device(in_ratio, in_ratio_h, 4, stream);
-  allocate(out_ratio_ref, 4);
+  raft::allocate(out_ratio_ref, 4, stream);
   T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375};
   update_device(out_ratio_ref, out_ratio_ref_h, 4, stream);
@@ -150,9 +150,9 @@ class MathTest : public ::testing::TestWithParam> {
   naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col);
   signFlip(in_sign_flip, params.n_row, params.n_col, stream);
-  allocate(in_recip, 4);
-  allocate(in_recip_ref, 4);
-  allocate(out_recip, 4);
+  raft::allocate(in_recip, 4, stream);
+  raft::allocate(in_recip_ref, 4, stream);
+  raft::allocate(out_recip, 4, stream);
   // default threshold is 1e-15
   std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16};
   std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0};
@@ -167,9 +167,9 @@ class MathTest : public ::testing::TestWithParam> {
   std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1};
   std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1};
-  allocate(in_smallzero, 4);
-  allocate(out_smallzero, 4);
-  allocate(out_smallzero_ref, 4);
+  raft::allocate(in_smallzero, 4, stream);
+  raft::allocate(out_smallzero, 4, stream);
+  raft::allocate(out_smallzero_ref, 4, stream);
   update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream);
   update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream);
   setSmallValuesZero(out_smallzero, in_smallzero, 4, stream);
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 2d2d9d2057..047415957c 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -46,9 +46,9 @@ class MatrixTest : public ::testing::TestWithParam> {
   int len = params.n_row * params.n_col;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in1, len);
-  raft::allocate(in2, len);
-  raft::allocate(in1_revr, len);
+  raft::allocate(in1, len, stream);
+  raft::allocate(in2, len, stream);
+  raft::allocate(in1_revr, len, stream);
   r.uniform(in1, len, T(-1.0), T(1.0), stream);
   copy(in1, in2, params.n_row, params.n_col, stream);
@@ -56,7 +56,7 @@ class MatrixTest : public ::testing::TestWithParam> {
   // colReverse(in1_revr, params.n_row, params.n_col);
   T *outTrunc;
-  raft::allocate(outTrunc, 6);
+  raft::allocate(outTrunc, 6, stream);
   truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index af10dcab30..ee5312de22 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -88,8 +88,8 @@ class RngTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(data, params.len);
-  allocate(stats, 2, true);
+  raft::allocate(data, params.len, stream);
+  raft::allocate(stats, 2, stream, true);
   switch (params.type) {
     case RNG_Normal:
       r.normal(data, params.len, params.start, params.end, stream);
@@ -383,9 +383,9 @@ TEST(Rng, MeanError) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_result, num_experiments);
-  allocate(std_result, num_experiments);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_result, num_experiments, stream);
+  raft::allocate(std_result, num_experiments, stream);
   for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) {
     Rng r(seed, rtype);
@@ -432,7 +432,7 @@ class ScaledBernoulliTest : public ::testing::Test {
   Rng r(42);
-  allocate(data, len * sizeof(T), stream);
+  raft::allocate(data, len * sizeof(T), stream);
   r.scaled_bernoulli(data, len, T(0.5), T(scale), stream);
 }
@@ -463,7 +463,7 @@ class BernoulliTest : public ::testing::Test {
 void SetUp() override {
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(42);
-  allocate(data, len * sizeof(bool), stream);
+  raft::allocate(data, len * sizeof(bool), stream);
   r.bernoulli(data, len, T(0.5), stream);
 }
@@ -518,9 +518,9 @@ class RngNormalTableTest
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(data, len);
-  allocate(stats, 2, true);
-  allocate(mu_vec, params.cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(stats, 2, stream, true);
+  raft::allocate(mu_vec, params.cols, stream);
   r.fill(mu_vec, params.cols, params.mu, stream);
   T* sigma_vec = nullptr;
   r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec,
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index 92f12206e8..cd948d50aa 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -72,8 +72,8 @@ class RngTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, params.len);
-  allocate(stats, 2, true);
+  raft::allocate(data, params.len, stream);
+  raft::allocate(stats, 2, stream, true);
   switch (params.type) {
     case RNG_Uniform:
      r.uniformInt(data, params.len, params.start, params.end, stream);
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index d7e52a8958..7883c8fe1b 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -50,10 +50,10 @@ class SWoRTest : public ::testing::TestWithParam> {
   CUDA_CHECK(cudaStreamCreate(&stream));
   Rng r(params.seed, params.gtype);
-  allocate(in, params.len);
-  allocate(wts, params.len);
-  allocate(out, params.sampledLen);
-  allocate(outIdx, params.sampledLen);
+  raft::allocate(in, params.len, stream);
+  raft::allocate(wts, params.len, stream);
+  raft::allocate(out, params.sampledLen, stream);
+  raft::allocate(outIdx, params.sampledLen, stream);
   h_outIdx.resize(params.sampledLen);
   r.uniform(in, params.len, T(-1.0), T(1.0), stream);
   r.uniform(wts, params.len, T(1.0), T(2.0), stream);
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index 38c3e5bfbb..5b2e5bd5b4 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -56,21 +56,21 @@ class CSRAddTest
   cudaStreamCreate(&stream);
-  raft::allocate(ind_a, n_rows);
-  raft::allocate(ind_ptr_a, nnz_a);
-  raft::allocate(values_a, nnz_a);
+  raft::allocate(ind_a, n_rows, stream);
+  raft::allocate(ind_ptr_a, nnz_a, stream);
+  raft::allocate(values_a, nnz_a, stream);
 
-  raft::allocate(ind_b, n_rows);
-  raft::allocate(ind_ptr_b, nnz_b);
-  raft::allocate(values_b, nnz_b);
+  raft::allocate(ind_b, n_rows, stream);
+  raft::allocate(ind_ptr_b, nnz_b, stream);
+  raft::allocate(values_b, nnz_b, stream);
 
-  raft::allocate(ind_verify, n_rows);
-  raft::allocate(ind_ptr_verify, nnz_result);
-  raft::allocate(values_verify, nnz_result);
+  raft::allocate(ind_verify, n_rows, stream);
+  raft::allocate(ind_ptr_verify, nnz_result, stream);
+  raft::allocate(values_verify, nnz_result, stream);
 
-  raft::allocate(ind_result, n_rows);
-  raft::allocate(ind_ptr_result, nnz_result);
-  raft::allocate(values_result, nnz_result);
+  raft::allocate(ind_result, n_rows, stream);
+  raft::allocate(ind_ptr_result, nnz_result, stream);
+  raft::allocate(values_result, nnz_result, stream);
 }
 void Run() {
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index ea69ecfc53..8af6d30d7d 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -43,9 +43,9 @@ class CSRtoCOOTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   cudaStreamCreate(&stream);
-  raft::allocate(ex_scan, params.ex_scan.size());
-  raft::allocate(verify, params.verify.size());
-  raft::allocate(result, params.verify.size(), true);
+  raft::allocate(ex_scan, params.ex_scan.size(), stream);
+  raft::allocate(verify, params.verify.size(), stream);
+  raft::allocate(result, params.verify.size(), stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 5bee5a000e..871d8475e8 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -68,9 +68,9 @@ TEST_P(SortedCOOToCSR, Result) {
   int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
   int *exp_h = new int[4]{0, 2, 4, 6};
-  raft::allocate(in, nnz, true);
-  raft::allocate(exp, 4, true);
-  raft::allocate(out, 4, true);
+  raft::allocate(in, nnz, stream, true);
+  raft::allocate(exp, 4, stream, true);
+  raft::allocate(out, 4, stream, true);
   raft::update_device(in, in_h, nnz, stream);
   raft::update_device(exp, exp_h, 4, stream);
@@ -112,10 +112,10 @@ class CSRAdjGraphTest
   cudaStreamCreate(&stream);
   nnz = params.verify.size();
-  raft::allocate(row_ind, params.n_rows);
-  raft::allocate(adj, params.n_rows * params.n_cols);
-  raft::allocate(result, nnz, true);
-  raft::allocate(verify, nnz);
+  raft::allocate(row_ind, params.n_rows, stream);
+  raft::allocate(adj, params.n_rows * params.n_cols, stream);
+  raft::allocate(result, nnz, stream, true);
+  raft::allocate(verify, nnz, stream);
 }
 void Run() {
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index b158ffdedd..1cf88e7a77 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -59,9 +59,9 @@ class CSRRowSliceTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -71,9 +71,9 @@ class CSRRowSliceTest
   std::vector out_indices_ref_h = params.out_indices_ref_h;
   std::vector out_data_ref_h = params.out_data_ref_h;
-  allocate(out_indptr_ref, out_indptr_ref_h.size());
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_data_ref, out_data_ref_h.size());
+  raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data_ref, out_data_ref_h.size(), stream);
   update_device(out_indptr_ref, out_indptr_ref_h.data(),
                 out_indptr_ref_h.size(), stream);
@@ -82,9 +82,9 @@ class CSRRowSliceTest
   update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
                 stream);
-  allocate(out_indptr, out_indptr_ref_h.size());
-  allocate(out_indices, out_indices_ref_h.size());
-  allocate(out_data, out_data_ref_h.size());
+  raft::allocate(out_indptr, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data, out_data_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index d04799befa..4a7c9b7b10 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -55,9 +55,9 @@ class CSRToDenseTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -65,11 +65,11 @@ class CSRToDenseTest
   std::vector out_ref_h = params.out_ref_h;
-  allocate(out_ref, out_ref_h.size());
+  raft::allocate(out_ref, out_ref_h.size(), stream);
   update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream);
-  allocate(out, out_ref_h.size());
+  raft::allocate(out, out_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index 5baeadd16f..352a656bf7 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -61,9 +61,9 @@ class CSRTransposeTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
@@ -73,9 +73,9 @@ class CSRTransposeTest
   std::vector out_indices_ref_h = params.out_indices_ref_h;
   std::vector out_data_ref_h = params.out_data_ref_h;
-  allocate(out_indptr_ref, out_indptr_ref_h.size());
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_data_ref, out_data_ref_h.size());
+  raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data_ref, out_data_ref_h.size(), stream);
   update_device(out_indptr_ref, out_indptr_ref_h.data(),
                 out_indptr_ref_h.size(), stream);
@@ -84,9 +84,9 @@ class CSRTransposeTest
   update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
                 stream);
-  allocate(out_indptr, out_indptr_ref_h.size());
-  allocate(out_indices, out_indices_ref_h.size());
-  allocate(out_data, out_data_ref_h.size());
+  raft::allocate(out_indptr, out_indptr_ref_h.size(), stream);
+  raft::allocate(out_indices, out_indices_ref_h.size(), stream);
+  raft::allocate(out_data, out_data_ref_h.size(), stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index 5d687ad92b..68dc1b51ac 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -48,14 +48,16 @@ const std::vector> inputsf = {{5, 10, 5, 1234ULL}};
 typedef SparseDegreeTests COODegree;
 TEST_P(COODegree, Result) {
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
   int *in_rows, *verify, *results;
   int in_rows_h[5] = {0, 0, 1, 2, 2};
   int verify_h[5] = {2, 1, 2, 0, 0};
-  raft::allocate(in_rows, 5);
-  raft::allocate(verify, 5, true);
-  raft::allocate(results, 5, true);
+  raft::allocate(in_rows, 5, stream);
+  raft::allocate(verify, 5, stream, true);
+  raft::allocate(results, 5, stream, true);
   raft::update_device(in_rows, *&in_rows_h, 5, 0);
   raft::update_device(verify, *&verify_h, 5, 0);
@@ -67,6 +69,7 @@ TEST_P(COODegree, Result) {
   CUDA_CHECK(cudaFree(in_rows));
   CUDA_CHECK(cudaFree(verify));
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 typedef SparseDegreeTests COODegreeNonzero;
@@ -81,10 +84,10 @@ TEST_P(COODegreeNonzero, Result) {
   float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
   int verify_h[5] = {1, 0, 2, 0, 0};
-  raft::allocate(in_rows, 5);
-  raft::allocate(verify, 5, true);
-  raft::allocate(results, 5, true);
-  raft::allocate(in_vals, 5, true);
+  raft::allocate(in_rows, 5, stream);
+  raft::allocate(verify, 5, stream, true);
+  raft::allocate(results, 5, stream, true);
+  raft::allocate(in_vals, 5, stream, true);
   raft::update_device(in_rows, *&in_rows_h, 5, 0);
   raft::update_device(verify, *&verify_h, 5, 0);
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 4f40e16eba..6b18e773cc 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -160,9 +160,9 @@ class SparseDistanceCOOSPMVTest
   std::vector indices_h = params.input_configuration.indices_h;
   std::vector data_h = params.input_configuration.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -173,7 +173,8 @@ class SparseDistanceCOOSPMVTest
   std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h;
-  allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+  raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1),
+                 handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 handle.get_stream());
@@ -200,7 +201,7 @@ class SparseDistanceCOOSPMVTest
   int out_size = dist_config.a_nrows * dist_config.b_nrows;
-  allocate(out_dists, out_size);
+  raft::allocate(out_dists, out_size, handle.get_stream());
   run_spmv();
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 8874e5ddb8..4ae95ae232 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -81,7 +81,7 @@ class SparseDistanceTest
   int out_size = dist_config.a_nrows * dist_config.b_nrows;
-  allocate(out_dists, out_size);
+  raft::allocate(out_dists, out_size, handle.get_stream());
   pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg);
@@ -109,9 +109,9 @@ class SparseDistanceTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -121,7 +121,8 @@ class SparseDistanceTest
   std::vector out_dists_ref_h = params.out_dists_ref_h;
-  allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
+  raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1),
+                 handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 dist_config.handle.get_stream());
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index dbb9ac0a3d..8b5c9edebe 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -103,9 +103,9 @@ class SparseKNNTest
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), handle.get_stream());
+  raft::allocate(indices, indices_h.size(), handle.get_stream());
+  raft::allocate(data, data_h.size(), handle.get_stream());
   update_device(indptr, indptr_h.data(), indptr_h.size(),
                 handle.get_stream());
@@ -116,16 +116,17 @@ class SparseKNNTest
   std::vector out_dists_ref_h = params.out_dists_ref_h;
   std::vector out_indices_ref_h = params.out_indices_ref_h;
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_dists_ref, out_dists_ref_h.size());
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(),
+                 handle.get_stream());
+  raft::allocate(out_dists_ref, out_dists_ref_h.size(), handle.get_stream());
   update_device(out_indices_ref, out_indices_ref_h.data(),
                 out_indices_ref_h.size(), handle.get_stream());
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 handle.get_stream());
-  allocate(out_dists, n_rows * k);
-  allocate(out_indices, n_rows * k);
+  raft::allocate(out_dists, n_rows * k, handle.get_stream());
+  raft::allocate(out_indices, n_rows * k, handle.get_stream());
 }
 raft::handle_t handle;
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu
index d4e9a915e5..a89529ddb3 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/knn_graph.cu
@@ -68,7 +68,7 @@ class KNNGraphTest
   out = new raft::sparse::COO(stream);
-  allocate(X, params.X.size());
+  raft::allocate(X, params.X.size(), stream);
   update_device(X, params.X.data(), params.X.size(), stream);
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 291880bf53..4d1a5a9ad6 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -162,8 +162,8 @@ class LinkageTest : public ::testing::TestWithParam> {
              handle.get_stream());
   // Allocate result labels and expected labels on device
-  raft::allocate(labels, params.n_row);
-  raft::allocate(labels_ref, params.n_row);
+  raft::allocate(labels, params.n_row, handle.get_stream());
+  raft::allocate(labels_ref, params.n_row, handle.get_stream());
   raft::copy(data.data(), params.data.data(), data.size(),
              handle.get_stream());
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 7adbbf8b9a..f9a30d5147 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -47,10 +47,10 @@ class CSRRowNormalizeTest
     CSRRowNormalizeInputs>::GetParam();
   cudaStreamCreate(&stream);
-  raft::allocate(in_vals, params.in_vals.size());
-  raft::allocate(verify, params.verify.size());
-  raft::allocate(ex_scan, params.ex_scan.size());
-  raft::allocate(result, params.verify.size(), true);
+  raft::allocate(in_vals, params.in_vals.size(), stream);
+  raft::allocate(verify, params.verify.size(), stream);
+  raft::allocate(ex_scan, params.ex_scan.size(), stream);
+  raft::allocate(result, params.verify.size(), stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index b64fa25883..e584598f85 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -59,9 +59,9 @@ class CSRRowOpTest
   n_rows = params.ex_scan.size();
   nnz = params.verify.size();
-  raft::allocate(verify, nnz);
-  raft::allocate(ex_scan, n_rows);
-  raft::allocate(result, nnz, true);
+  raft::allocate(verify, nnz, stream);
+  raft::allocate(ex_scan, n_rows, stream);
+  raft::allocate(result, nnz, stream, true);
 }
 void Run() {
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
index 74aa9d6eaf..4434b65dde 100644
--- a/cpp/test/sparse/selection.cu
+++ b/cpp/test/sparse/selection.cu
@@ -57,25 +57,25 @@ class SparseSelectionTest
 void make_data() {
   std::vector dists_h = params.dists_h;
-  allocate(dists, n_rows * n_cols);
+  raft::allocate(dists, n_rows * n_cols, stream);
   update_device(dists, dists_h.data(), dists_h.size(), stream);
-  allocate(inds, n_rows * n_cols);
+  raft::allocate(inds, n_rows * n_cols, stream);
   iota_fill(inds, n_rows, n_cols, stream);
   std::vector out_dists_ref_h = params.out_dists_ref_h;
   std::vector out_indices_ref_h = params.out_indices_ref_h;
-  allocate(out_indices_ref, out_indices_ref_h.size());
-  allocate(out_dists_ref, out_dists_ref_h.size());
+  raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream);
+  raft::allocate(out_dists_ref, out_dists_ref_h.size(), stream);
   update_device(out_indices_ref, out_indices_ref_h.data(),
                 out_indices_ref_h.size(), stream);
   update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                 stream);
-  allocate(out_dists, n_rows * k);
-  allocate(out_indices, n_rows * k);
+  raft::allocate(out_dists, n_rows * k, stream);
+  raft::allocate(out_indices, n_rows * k, stream);
 }
 void SetUp() override {
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index 9deac1b82f..e73a8a547b 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -55,7 +55,7 @@ TEST_P(COOSort, Result) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(in_vals, params.nnz);
+  raft::allocate(in_vals, params.nnz, stream);
   r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
   int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
@@ -68,9 +68,9 @@ TEST_P(COOSort, Result) {
     in_cols_h[i] = i;
 }
-  raft::allocate(in_rows, params.nnz);
-  raft::allocate(in_cols, params.nnz);
-  raft::allocate(verify, params.nnz);
+  raft::allocate(in_rows, params.nnz, stream);
+  raft::allocate(in_cols, params.nnz, stream);
+  raft::allocate(verify, params.nnz, stream);
   raft::update_device(in_rows, in_rows_h, params.nnz, stream);
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index a060b8128a..c854292b2d 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -64,9 +64,9 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
   std::vector indices_h = params.indices_h;
   std::vector data_h = params.data_h;
-  allocate(indptr, indptr_h.size());
-  allocate(indices, indices_h.size());
-  allocate(data, data_h.size());
+  raft::allocate(indptr, indptr_h.size(), stream);
+  raft::allocate(indices, indices_h.size(), stream);
+  raft::allocate(data, data_h.size(), stream);
   update_device(indptr, indptr_h.data(), indptr_h.size(), stream);
   update_device(indices, indices_h.data(), indices_h.size(), stream);
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index e62bec4381..a56dc03be3 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -29,16 +29,19 @@ template
 class HaversineKNNTest : public ::testing::Test {
  protected:
  void basicTest() {
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
   // Allocate input
-  raft::allocate(d_train_inputs, n * d);
+  raft::allocate(d_train_inputs, n * d, stream);
   // Allocate reference arrays
-  raft::allocate(d_ref_I, n * n);
-  raft::allocate(d_ref_D, n * n);
+  raft::allocate(d_ref_I, n * n, stream);
+  raft::allocate(d_ref_D, n * n, stream);
   // Allocate predicted arrays
-  raft::allocate(d_pred_I, n * n);
-  raft::allocate(d_pred_D, n * n);
+  raft::allocate(d_pred_I, n * n, stream);
+  raft::allocate(d_pred_D, n * n, stream);
   // make testdata on host
   std::vector h_train_inputs = {
@@ -68,9 +71,6 @@ class HaversineKNNTest : public ::testing::Test {
   std::vector input_vec = {d_train_inputs};
   std::vector sizes_vec = {n};
-  cudaStream_t stream;
-  CUDA_CHECK(cudaStreamCreate(&stream));
   raft::spatial::knn::detail::haversine_knn(
     d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream);
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 2b1ef89f7a..edf46b3fe0 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -64,8 +64,8 @@ class KNNTest : public ::testing::TestWithParam {
   auto stream = handle_.get_stream();
-  raft::allocate(actual_labels_, rows_ * k_, true);
-  raft::allocate(expected_labels_, rows_ * k_, true);
+  raft::allocate(actual_labels_, rows_ * k_, stream, true);
+  raft::allocate(expected_labels_, rows_ * k_, stream, true);
   std::vector input_vec;
   std::vector sizes_vec;
@@ -100,6 +100,8 @@ class KNNTest : public ::testing::TestWithParam {
   cols_ = params_.input[0].size();
   k_ = params_.k;
+  cudaStream_t stream = handle_.get_stream();
+
   std::vector row_major_input;
   for (int i = 0; i < params_.input.size(); ++i) {
     for (int j = 0; j < params_.input[i].size(); ++j) {
@@ -107,24 +109,22 @@ class KNNTest : public ::testing::TestWithParam {
     }
   }
   rmm::device_buffer input_d = rmm::device_buffer(
-    row_major_input.data(), row_major_input.size() * sizeof(float),
-    handle_.get_stream());
+    row_major_input.data(), row_major_input.size() * sizeof(float), stream);
   float *input_ptr = static_cast(input_d.data());
   rmm::device_buffer labels_d = rmm::device_buffer(
-    params_.labels.data(), params_.labels.size() * sizeof(int),
-    handle_.get_stream());
+    params_.labels.data(), params_.labels.size() * sizeof(int), stream);
   int *labels_ptr = static_cast(labels_d.data());
-  raft::allocate(input_, rows_ * cols_, true);
-  raft::allocate(search_data_, rows_ * cols_, true);
-  raft::allocate(indices_, rows_ * k_, true);
-  raft::allocate(distances_, rows_ * k_, true);
-  raft::allocate(search_labels_, rows_, true);
+  raft::allocate(input_, rows_ * cols_, stream, true);
+  raft::allocate(search_data_, rows_ * cols_, stream, true);
+  raft::allocate(indices_, rows_ * k_, stream, true);
+  raft::allocate(distances_, rows_ * k_, stream, true);
+  raft::allocate(search_labels_, rows_, stream, true);
-  raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream());
-  raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream());
-  raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream());
+  raft::copy(input_, input_ptr, rows_ * cols_, stream);
+  raft::copy(search_data_, input_ptr, rows_ * cols_, stream);
+  raft::copy(search_labels_, labels_ptr, rows_, stream);
 }
 void TearDown() override {
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 4a3b0ed196..3c79fc2cae 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -52,8 +52,8 @@ class MeanTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_act, cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_act, cols, stream);
   r.normal(data, len, params.mean, (T)1.0, stream);
   meanSGtest(data, stream);
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 8b0d607561..89118b877b 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -54,10 +54,10 @@ class MeanCenterTest
   auto len = rows * cols;
   IdxType vecLen = params.bcastAlongRows ? cols : rows;
-  raft::allocate(out, len);
-  raft::allocate(out_ref, len);
-  raft::allocate(data, len);
-  raft::allocate(meanVec, vecLen);
+  raft::allocate(out, len, stream);
+  raft::allocate(out_ref, len, stream);
+  raft::allocate(data, len, stream);
+  raft::allocate(meanVec, vecLen, stream);
   r.normal(data, len, params.mean, (T)1.0, stream);
   raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor,
                     stream);
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index ff2698788f..6dfa186825 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -49,10 +49,10 @@ class StdDevTest : public ::testing::TestWithParam> {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  allocate(data, len);
-  allocate(mean_act, cols);
-  allocate(stddev_act, cols);
-  allocate(vars_act, cols);
+  raft::allocate(data, len, stream);
+  raft::allocate(mean_act, cols, stream);
+  raft::allocate(stddev_act, cols, stream);
+  raft::allocate(vars_act, cols, stream);
   r.normal(data, len, params.mean, params.stddev, stream);
   stdVarSGtest(data, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index c3140d4588..bf7a615f84 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -45,7 +45,7 @@ class SumTest : public ::testing::TestWithParam> {
   int len = rows * cols;
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  raft::allocate(data, len);
+  raft::allocate(data, len, stream);
   T data_h[len];
   for (int i = 0; i < len; i++) {
@@ -54,7 +54,7 @@ class SumTest : public ::testing::TestWithParam> {
   raft::update_device(data, data_h, len, stream);
-  raft::allocate(sum_act, cols);
+  raft::allocate(sum_act, cols, stream);
   sum(sum_act, data, cols, rows, false, stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
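
[Note: every test file above follows the same mechanical migration: create
the CUDA stream first, then pass it to each raft::allocate call. A condensed
sketch of the resulting fixture shape (names and sizes are illustrative, not
taken from any one file):

    template <typename T>
    class ExampleTest : public ::testing::Test {
     protected:
      void SetUp() override {
        CUDA_CHECK(cudaStreamCreate(&stream));
        raft::allocate(data, len, stream);    // stream-ordered allocation
      }
      void TearDown() override {
        raft::deallocate(data, len, stream);  // release on the same stream
        CUDA_CHECK(cudaStreamDestroy(stream));
      }
      cudaStream_t stream;
      T* data;
      static const size_t len = 256;
    };

The next patch in the series centralizes the cleanup so fixtures no longer
need to track allocation lengths themselves.]
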
From e571293dcb126da17ce58a72bdec8f8075a3322c Mon Sep 17 00:00:00 2001
From: viclafargue
Date: Wed, 7 Jul 2021 19:43:03 +0200
Subject: [PATCH 08/17] RAFT alloc + dealloc helper system

---
 cpp/include/raft/cudart_utils.h               | 33 +++++++++++++++----
 cpp/test/distance/dist_adj.cu                 | 12 ++-----
 cpp/test/distance/distance_base.cuh           | 12 +++----
 cpp/test/distance/fused_l2_nn.cu              | 14 ++------
 cpp/test/label/label.cu                       |  6 ++--
 cpp/test/linalg/add.cu                        |  6 +---
 cpp/test/linalg/coalesced_reduction.cu        |  9 +++--
 cpp/test/linalg/divide.cu                     |  9 +++--
 cpp/test/linalg/eig.cu                        | 10 +-----
 cpp/test/linalg/eig_sel.cu                    |  9 ++---
 cpp/test/linalg/map.cu                        | 12 +++----
 cpp/test/linalg/map_then_reduce.cu            |  9 +++--
 cpp/test/linalg/matrix_vector_op.cu           | 11 +++----
 cpp/test/linalg/multiply.cu                   |  9 +++--
 cpp/test/linalg/norm.cu                       |  6 ++--
 cpp/test/linalg/reduce.cu                     |  4 +--
 cpp/test/linalg/strided_reduction.cu          |  4 +--
 cpp/test/linalg/subtract.cu                   | 10 +++---
 cpp/test/linalg/svd.cu                        | 14 +++-----
 cpp/test/linalg/transpose.cu                  |  6 +---
 cpp/test/linalg/unary_op.cu                   |  5 +--
 cpp/test/matrix/math.cu                       | 21 ++----------
 cpp/test/matrix/matrix.cu                     |  9 +++--
 cpp/test/random/rng.cu                        | 21 +++++-------
 cpp/test/random/rng_int.cu                    |  8 ++---
 cpp/test/random/sample_without_replacement.cu |  6 +---
 cpp/test/sparse/add.cu                        | 14 ++------
 cpp/test/sparse/convert_coo.cu                |  4 +--
 cpp/test/sparse/convert_csr.cu                |  7 ++--
 cpp/test/sparse/csr_row_slice.cu              | 12 ++-----
 cpp/test/sparse/csr_to_dense.cu               |  8 ++---
 cpp/test/sparse/csr_transpose.cu              | 12 ++-----
 cpp/test/sparse/degree.cu                     |  7 ++--
 cpp/test/sparse/dist_coo_spmv.cu              |  9 +----
 cpp/test/sparse/distance.cu                   |  9 +----
 cpp/test/sparse/knn.cu                        | 10 +-----
 cpp/test/sparse/knn_graph.cu                  |  3 +-
 cpp/test/sparse/linkage.cu                    | 28 +++++++---------
 cpp/test/sparse/norm.cu                       |  7 ++--
 cpp/test/sparse/row_op.cu                     |  6 ++--
 cpp/test/sparse/selection.cu                  | 10 +-----
 cpp/test/spatial/haversine.cu                 | 12 +++----
 cpp/test/spatial/knn.cu                       |  6 ++--
 cpp/test/stats/mean.cu                        |  6 ++--
 cpp/test/stats/mean_center.cu                 | 10 +++---
 cpp/test/stats/stddev.cu                      | 10 +++---
 cpp/test/stats/sum.cu                         |  8 ++---
 47 files changed, 157 insertions(+), 316 deletions(-)

diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 8b8b3cbeca..3c07f1974d 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -26,6 +26,8 @@
 #include
 #include
 #include
+#include <mutex>
+#include <unordered_map>
 
 ///@todo: enable once logging has been enabled in raft
 //#include "logger.hpp"
@@ -260,19 +262,36 @@ void print_device_vector(const char* variable_name, const T* devMem,
 }
 /** @} */
 
+static std::mutex mutex_;
+static std::unordered_map<void*, size_t> allocations;
+
 template <typename Type>
 void allocate(Type*& ptr, size_t len, cudaStream_t stream,
               bool setZero = false) {
-  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(
-    len * sizeof(Type), stream);
-  if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, len * sizeof(Type)));
+  size_t size = len * sizeof(Type);
+  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
+  if (setZero) CUDA_CHECK(cudaMemset((void*)ptr, 0, size));
+
+  std::lock_guard _(mutex_);
+  allocations[ptr] = size;
 }
 
 template <typename Type>
-void deallocate(Type*& ptr, size_t len, cudaStream_t stream,
-                bool setZero = false) {
-  rmm::mr::get_current_device_resource()->deallocate(ptr, len * sizeof(Type),
-                                                     stream);
+void deallocate(Type*& ptr, cudaStream_t stream) {
+  std::lock_guard _(mutex_);
+  size_t size = allocations[ptr];
+  allocations.erase(ptr);
+  rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
+}
+
+inline void deallocate_all(cudaStream_t stream) {
+  std::lock_guard _(mutex_);
+  for (auto& alloc : allocations) {
+    void* ptr = alloc.first;
+    size_t size = alloc.second;
+    rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
+  }
+  allocations.clear();
 }
 
 /** helper method to get max usable shared mem per block parameter */
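
[Note: the helper system above keeps a mutex-guarded registry: allocate
records each pointer's byte size in the map, deallocate looks the size back
up (RMM's memory resources require the original size on deallocation), and
deallocate_all sweeps everything still registered. The intended TearDown
usage, sketched with an illustrative fixture rather than code from the patch:

    void TearDown() override {
      raft::deallocate_all(stream);     // frees every tracked allocation
      CUDA_CHECK(cudaStreamDestroy(stream));
    }

One caveat: because mutex_ and allocations are declared static at namespace
scope in a header, each translation unit gets its own copy of the registry.
That is fine for these single-file tests, but the registry would not track
allocations across compilation units.]
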
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index dc397a5eb2..8d5cd68f13 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -77,7 +77,6 @@ class DistanceAdjTest
   int n = params.n;
   int k = params.k;
   bool isRowMajor = params.isRowMajor;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(x, m * k, stream);
   raft::allocate(y, n * k, stream);
   raft::allocate(dist_ref, m * n, stream);
   raft::allocate(dist, m * n, stream);
@@ -103,21 +102,16 @@ class DistanceAdjTest
     x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor);
-  CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(workspace));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(x));
-    CUDA_CHECK(cudaFree(y));
-    CUDA_CHECK(cudaFree(dist_ref));
-    CUDA_CHECK(cudaFree(dist));
-  }
+  void TearDown() override {}
 protected:
   DistanceAdjInputs params;
   DataType *x, *y;
   bool *dist_ref, *dist;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index ac12076e3a..2b0d718890 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -167,7 +167,6 @@ class DistanceTest : public ::testing::TestWithParam> {
   int n = params.n;
   int k = params.k;
   bool isRowMajor = params.isRowMajor;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(x, m * k, stream);
   raft::allocate(y, n * k, stream);
@@ -189,21 +188,18 @@ class DistanceTest : public ::testing::TestWithParam> {
   distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace,
                    worksize, stream, isRowMajor);
-  CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(workspace));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(x));
-  CUDA_CHECK(cudaFree(y));
-  CUDA_CHECK(cudaFree(dist_ref));
-  CUDA_CHECK(cudaFree(dist));
-  CUDA_CHECK(cudaFree(dist2));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   DistanceInputs params;
   DataType *x, *y, *dist_ref, *dist, *dist2;
+  cudaStream_t stream;
 };
 } // end namespace distance
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 8f12b26dc0..cfea4ee2d9 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -122,15 +122,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> {
 }
 void TearDown() override {
-  CUDA_CHECK(cudaStreamSynchronize(stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(x));
-  CUDA_CHECK(cudaFree(y));
-  CUDA_CHECK(cudaFree(xn));
-  CUDA_CHECK(cudaFree(yn));
-  CUDA_CHECK(cudaFree(workspace));
-  CUDA_CHECK(cudaFree(min_ref));
-  CUDA_CHECK(cudaFree(min));
 }
 protected:
@@ -286,10 +279,7 @@ class FusedL2NNDetTest : public FusedL2NNTest {
   raft::allocate(min1, m, stream);
 }
-  void TearDown() override {
-    FusedL2NNTest::TearDown();
-    CUDA_CHECK(cudaFree(min1));
-  }
+  void TearDown() override { FusedL2NNTest::TearDown(); }
 protected:
   cub::KeyValuePair *min1;
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 86a8927283..b28c754a5a 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -62,9 +62,8 @@ TEST_F(MakeMonotonicTest, Result) {
   ASSERT_TRUE(devArrMatch(actual, expected, m, raft::Compare(), stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(data));
-  CUDA_CHECK(cudaFree(actual));
   delete data_h;
   delete expected_h;
@@ -100,9 +99,8 @@ TEST(labelTest, Classlabels) {
   EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows,
                               raft::Compare(), stream));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
-  CUDA_CHECK(cudaFree(y_d));
-  CUDA_CHECK(cudaFree(y_relabeled_d));
 }
 }; // namespace label
 }; // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index ab7de0f24b..301f069a33 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -43,11 +43,7 @@ class AddTest : public ::testing::TestWithParam> {
 }
 void TearDown() override {
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  CUDA_CHECK(cudaFree(in1));
-  CUDA_CHECK(cudaFree(in2));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index bfbaf9b5f9..45dbd9dcc4 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -57,7 +57,6 @@ class coalescedReductionTest
   raft::random::Rng r(params.seed);
   int rows = params.rows, cols = params.cols;
   int len = rows * cols;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(data, len, stream);
   raft::allocate(dots_exp, rows, stream);
   raft::allocate(dots_act, rows, stream);
@@ -70,18 +69,18 @@
   // Add to result with inplace = true next
   coalescedReductionLaunch(dots_act, data, cols, rows, stream, true);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(data));
-  CUDA_CHECK(cudaFree(dots_exp));
-  CUDA_CHECK(cudaFree(dots_act));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   coalescedReductionInputs params;
   T *data, *dots_exp, *dots_act;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 07fbda7e1c..563f96c835 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -51,7 +51,6 @@ class DivideTest
   ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
   int len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveDivide(out_ref, in, params.scalar, len, stream);
   divideScalar(out, in, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   UnaryOpInputs params;
   T *in, *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index d53713f004..6e26757cf3 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -97,15 +97,7 @@ class EigTest : public ::testing::TestWithParam> {
            sweeps);
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(cov_matrix));
-    CUDA_CHECK(cudaFree(eig_vectors));
-    CUDA_CHECK(cudaFree(eig_vectors_jacobi));
-    CUDA_CHECK(cudaFree(eig_vals));
-    CUDA_CHECK(cudaFree(eig_vals_jacobi));
-    CUDA_CHECK(cudaFree(eig_vectors_ref));
-    CUDA_CHECK(cudaFree(eig_vals_ref));
-  }
+  void TearDown() override { raft::deallocate_all(stream); }
 protected:
   EigInputs params;
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 26c8177269..bdd0a08ff6 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -72,15 +72,10 @@ class EigSelTest : public ::testing::TestWithParam> {
   eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors,
            eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
-  void TearDown() override {
-    CUDA_CHECK(cudaFree(cov_matrix));
-    CUDA_CHECK(cudaFree(eig_vectors));
-    CUDA_CHECK(cudaFree(eig_vals));
-    CUDA_CHECK(cudaFree(eig_vectors_ref));
-    CUDA_CHECK(cudaFree(eig_vals_ref));
-  }
+  void TearDown() override { raft::deallocate_all(stream); }
 protected:
   EigSelInputs params;
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 6087893006..5b13fb5362 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -51,7 +51,6 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
   scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream);
-  CUDA_CHECK(cudaFree(tmp));
 }
 template
@@ -63,7 +62,6 @@ class MapTest
   ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   IdxType len = params.len;
   raft::allocate(in1, len, stream);
   raft::allocate(in2, len, stream);
   raft::allocate(in3, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in1, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in2, len, InType(-1.0), InType(1.0), stream);
   r.uniform(in3, len, InType(-1.0), InType(1.0), stream);
   create_ref(out_ref, in1, in2, in3, params.scalar, len, stream);
   mapLaunch(out, in1, in2, in3, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in1));
-  CUDA_CHECK(cudaFree(in2));
-  CUDA_CHECK(cudaFree(in3));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MapInputs params;
   InType *in1, *in2, *in3;
   OutType *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 49b7deaf1f..edc6382706 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -75,26 +75,25 @@ class MapReduceTest : public ::testing::TestWithParam> {
   raft::random::Rng r(params.seed);
   auto len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   mapReduceLaunch(out_ref, out, in, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(out));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MapReduceInputs params;
   InType *in;
   OutType *out_ref, *out;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 933f806ff5..e017ee0918 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -66,7 +66,6 @@ class MatVecOpTest
   IdxType N = params.rows, D = params.cols;
   IdxType len = N * D;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
@@ -86,20 +85,18 @@ class MatVecOpTest
 }
   matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor,
                        params.bcastAlongRows, params.useTwoVectors, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(vec1));
-  CUDA_CHECK(cudaFree(vec2));
-  CUDA_CHECK(cudaFree(out));
-  CUDA_CHECK(cudaFree(out_ref));
-  CUDA_CHECK(cudaFree(in));
+  raft::deallocate_all(stream);
+  CUDA_CHECK(cudaStreamDestroy(stream));
 }
 protected:
   MatVecOpInputs params;
   T *in, *out, *out_ref, *vec1, *vec2;
+  cudaStream_t stream;
 };
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 33f2b1c104..d7bda7c27d 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -31,7 +31,6 @@ class MultiplyTest : public ::testing::TestWithParam> {
   params = ::testing::TestWithParam>::GetParam();
   raft::random::Rng r(params.seed);
   int len = params.len;
-  cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   raft::allocate(in, len, stream);
   raft::allocate(out_ref, len, stream);
   raft::allocate(out, len, stream);
   r.uniform(in, len, T(-1.0), T(1.0), stream);
   naiveScale(out_ref, in, params.scalar, len, stream);
   multiplyScalar(out, in, params.scalar, len, stream);
-  CUDA_CHECK(cudaStreamDestroy(stream));
+  CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 void TearDown() override {
-  CUDA_CHECK(cudaFree(in));
-  CUDA_CHECK(cudaFree(out_ref));
-
CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 29ea0a1915..5563064982 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -157,13 +157,11 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index dba8f76a22..8a3cf5d885 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -77,9 +77,7 @@ class ReduceTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 3fe076f1d4..55d8cc0e92 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -60,9 +60,7 @@ class stridedReductionTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 7318410330..27dea8503f 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -79,7 +79,6 @@ class SubtractTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in1, len, stream); raft::allocate(in2, len, stream); @@ -95,19 +94,18 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out, out, T(1), len, stream); subtract(in1, in1, in2, len, stream); subtractScalar(in1, in1, T(1), len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SubtractInputs params; T *in1, *in2, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index adfbfd01d6..72a27790de 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -48,7 +48,7 @@ class SvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream = handle.get_stream(); + stream = handle.get_stream(); raft::allocate(data, len, stream); ASSERT(params.n_row == 3, "This test only supports nrows=3!"); @@ -87,22 +87,16 @@ class SvdTest : public ::testing::TestWithParam> { svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(left_eig_vectors_qr)); - CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); - CUDA_CHECK(cudaFree(sing_vals_qr)); - CUDA_CHECK(cudaFree(left_eig_vectors_ref)); - CUDA_CHECK(cudaFree(right_eig_vectors_ref)); - CUDA_CHECK(cudaFree(sing_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: SvdInputs params; T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 08be179c59..c574f54a05 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -63,11 +63,7 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(data, params.n_row, stream); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(data_trans)); - CUDA_CHECK(cudaFree(data_trans_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: TranposeInputs params; diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 8ef810d794..042e8b9cbf 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -60,11 +60,8 @@ class UnaryOpTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); } virtual void DoTest() { diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 227aca643e..63381dec07 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -116,7 +116,7 @@ class MathTest : public ::testing::TestWithParam> { int len = params.len; raft::handle_t handle; - cudaStream_t stream; + stream = handle.get_stream(); CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in_power, len, stream); @@ -174,31 +174,16 @@ class MathTest : public ::testing::TestWithParam> { update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); setSmallValuesZero(in_smallzero, 4, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(in_power)); - CUDA_CHECK(cudaFree(out_power_ref)); - CUDA_CHECK(cudaFree(in_sqrt)); - CUDA_CHECK(cudaFree(out_sqrt_ref)); - CUDA_CHECK(cudaFree(in_ratio)); - CUDA_CHECK(cudaFree(out_ratio_ref)); - CUDA_CHECK(cudaFree(in_sign_flip)); - CUDA_CHECK(cudaFree(out_sign_flip_ref)); - CUDA_CHECK(cudaFree(in_recip)); - CUDA_CHECK(cudaFree(in_recip_ref)); - CUDA_CHECK(cudaFree(out_recip)); - CUDA_CHECK(cudaFree(in_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: MathInputs params; T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 047415957c..cc88df0a73 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -44,7 +44,6 @@ class MatrixTest : public ::testing::TestWithParam> { params = 
::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(in1, len, stream); raft::allocate(in2, len, stream); @@ -58,18 +57,18 @@ class MatrixTest : public ::testing::TestWithParam> { T *outTrunc; raft::allocate(outTrunc, 6, stream); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - // CUDA_CHECK(cudaFree(in1_revr)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatrixInputs params; T *in1, *in2, *in1_revr; + cudaStream_t stream; }; const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index ee5312de22..c2ec7a340f 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -85,7 +85,6 @@ class RngTest : public ::testing::TestWithParam> { // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; params = ::testing::TestWithParam>::GetParam(); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); raft::allocate(data, params.len, stream); @@ -124,12 +123,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -182,6 +181,7 @@ class RngTest : public ::testing::TestWithParam> { T *data, *stats; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; // The measured mean and standard deviation for each tested distribution are, @@ -416,10 +416,8 @@ TEST(Rng, MeanError) { ASSERT_TRUE( (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_result)); - CUDA_CHECK(cudaFree(std_result)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -515,7 +513,6 @@ class RngNormalTableTest params = ::testing::TestWithParam>::GetParam(); int len = params.rows * params.cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); raft::allocate(data, len, stream); @@ -532,13 +529,12 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); - CUDA_CHECK(cudaFree(mu_vec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -551,6 +547,7 @@ class RngNormalTableTest T *data, *stats, *mu_vec; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; typedef RngNormalTableTest RngNormalTableTestF; diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index cd948d50aa..a98619e5b4 100644 --- a/cpp/test/random/rng_int.cu +++ 
b/cpp/test/random/rng_int.cu @@ -70,7 +70,6 @@ class RngTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, params.len, stream); raft::allocate(stats, 2, stream, true); @@ -87,12 +86,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(float meanvar[2]) { @@ -110,6 +109,7 @@ class RngTest : public ::testing::TestWithParam> { T *data; float *stats; float h_stats[2]; // mean, var + cudaStream_t stream; }; typedef RngTest RngTestU32; diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index 7883c8fe1b..cf60f46afe 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -67,12 +67,8 @@ class SWoRTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(wts)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(outIdx)); } protected: diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 5b2e5bd5b4..8429a46941 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -110,18 +110,8 @@ class CSRAddTest } void TearDown() override { - CUDA_CHECK(cudaFree(ind_a)); - CUDA_CHECK(cudaFree(ind_b)); - CUDA_CHECK(cudaFree(ind_result)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(ind_ptr_verify)); - CUDA_CHECK(cudaFree(ind_ptr_result)); - CUDA_CHECK(cudaFree(values_a)); - CUDA_CHECK(cudaFree(values_b)); - CUDA_CHECK(cudaFree(values_verify)); - CUDA_CHECK(cudaFree(values_result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index 8af6d30d7d..4f9c00c7ab 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -62,9 +62,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 871d8475e8..465aad4e7f 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -132,11 +132,8 @@ class CSRAdjGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 1cf88e7a77..00e6899cb2 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -109,16 +109,8 @@ class CSRRowSliceTest } void TearDown() 
override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 4a7c9b7b10..7f6b7dad07 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -88,12 +88,8 @@ class CSRToDenseTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index 352a656bf7..e50a9d94a9 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -106,16 +106,8 @@ class CSRTransposeTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 68dc1b51ac..eede0dfc10 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -67,8 +67,7 @@ TEST_P(COODegree, Result) { ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } @@ -98,9 +97,7 @@ TEST_P(COODegreeNonzero, Result) { ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 6b18e773cc..563dcf6f15 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -208,14 +208,7 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 4ae95ae232..9115dbf0b5 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -88,14 +88,7 @@ class SparseDistanceTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - 
CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8b5c9edebe..22f97559b1 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -80,15 +80,7 @@ class SparseKNNTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index a89529ddb3..945943bd5a 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -91,7 +91,8 @@ class KNNGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(X)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 4d1a5a9ad6..d2222d12bf 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -154,30 +154,27 @@ template class LinkageTest : public ::testing::TestWithParam> { protected: void basicTest() { - raft::handle_t handle; + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, stream); // Allocate result labels and expected labels on device - raft::allocate(labels, params.n_row, handle.get_stream()); - raft::allocate(labels_ref, params.n_row, handle.get_stream()); + raft::allocate(labels, params.n_row, stream); + raft::allocate(labels_ref, params.n_row, stream); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), stream); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, stream); out_arrs.children = out_children.data(); + raft::handle_t handle; raft::hierarchy::single_linkage< IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( handle, data.data(), params.n_row, params.n_col, @@ -186,22 +183,21 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, handle.get_stream()); + score = compute_rand_index(labels, labels_ref, params.n_row, stream); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(labels_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: LinkageInputs params; IdxT *labels, *labels_ref; - double score; + cudaStream_t stream; }; const std::vector> linkage_inputsf2 = { diff --git 
a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index f9a30d5147..d69dd15c57 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -77,11 +77,8 @@ class CSRRowNormalizeTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index e584598f85..805a3d85da 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -75,10 +75,8 @@ class CSRRowOpTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 4434b65dde..256ecfdfb7 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -97,15 +97,7 @@ class SparseSelectionTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CUDA_CHECK(cudaFree(dists)); - CUDA_CHECK(cudaFree(inds)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index a56dc03be3..542f2d6fc8 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -29,7 +29,6 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Allocate input @@ -74,17 +73,14 @@ class HaversineKNNTest : public ::testing::Test { raft::spatial::knn::detail::haversine_knn( d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(d_train_inputs)); - CUDA_CHECK(cudaFree(d_pred_I)); - CUDA_CHECK(cudaFree(d_pred_D)); - CUDA_CHECK(cudaFree(d_ref_I)); - CUDA_CHECK(cudaFree(d_ref_D)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: @@ -100,6 +96,8 @@ class HaversineKNNTest : public ::testing::Test { value_idx *d_ref_I; value_t *d_ref_D; + + cudaStream_t stream; }; typedef HaversineKNNTest HaversineKNNTestF; diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index edf46b3fe0..8682dea7fc 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -128,10 +128,8 @@ class KNNTest : public ::testing::TestWithParam { } void TearDown() override { - CUDA_CHECK(cudaFree(search_data_)); - CUDA_CHECK(cudaFree(indices_)); - CUDA_CHECK(cudaFree(distances_)); - CUDA_CHECK(cudaFree(actual_labels_)); + cudaStream_t stream = handle_.get_stream(); + raft::deallocate_all(stream); } private: diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 3c79fc2cae..a3c88a92be 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -49,7 +49,6 @@ class MeanTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); @@ -66,13 +65,14 @@ 
class MeanTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanInputs params; T *data, *mean_act; + cudaStream_t stream; }; // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 89118b877b..b827230b5d 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -47,7 +47,6 @@ class MeanCenterTest params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; @@ -65,19 +64,18 @@ class MeanCenterTest params.bcastAlongRows, stream); raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(meanVec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanCenterInputs params; T *data, *meanVec, *out, *out_ref; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 6dfa186825..fd374249d2 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -47,7 +47,6 @@ class StdDevTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); raft::allocate(mean_act, cols, stream); @@ -55,7 +54,7 @@ class StdDevTest : public ::testing::TestWithParam> { raft::allocate(vars_act, cols, stream); r.normal(data, len, params.mean, params.stddev, stream); stdVarSGtest(data, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void stdVarSGtest(T *data, cudaStream_t stream) { @@ -73,15 +72,14 @@ class StdDevTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); - CUDA_CHECK(cudaFree(stddev_act)); - CUDA_CHECK(cudaFree(vars_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: StdDevInputs params; T *data, *mean_act, *stddev_act, *vars_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf7a615f84..58ebec7859 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -43,7 +43,6 @@ class SumTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(data, len, stream); @@ -56,17 +55,18 @@ class SumTest : public ::testing::TestWithParam> { raft::allocate(sum_act, cols, stream); sum(sum_act, data, cols, rows, false, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(sum_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SumInputs params; T *data, *sum_act; + cudaStream_t stream; }; 
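Every fixture converted above lands on the same shape: the stream becomes a member so TearDown() can see it, SetUp() creates the stream and allocates through the tracking raft::allocate(), the test body ends with cudaStreamSynchronize() instead of destroying the stream early, and TearDown() calls raft::deallocate_all() before cudaStreamDestroy(). A condensed sketch of that pattern; the fixture name and buffer size are illustrative:

#include <gtest/gtest.h>
#include <raft/cudart_utils.h>

class PatternTest : public ::testing::Test {
 protected:
  void SetUp() override {
    CUDA_CHECK(cudaStreamCreate(&stream));
    raft::allocate(data, len, stream, true);    // tracked allocation
    // ... enqueue the work under test on `stream` ...
    CUDA_CHECK(cudaStreamSynchronize(stream));  // sync, but keep the stream alive
  }

  void TearDown() override {
    raft::deallocate_all(stream);               // one call frees every tracked pointer
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  static constexpr size_t len = 1024;
  float* data;
  cudaStream_t stream;
};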
const std::vector> inputsf = {{0.05f, 1024, 32, 1234ULL}, From 192882a5faf45680eb21f3e1ef1786a082958612 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 3 Aug 2021 12:03:13 +0200 Subject: [PATCH 09/17] Update handle exec policy --- cpp/include/raft/handle.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 1b1923abb7..611868f55f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -140,14 +140,10 @@ class handle_t { return cusparse_handle_; } - thrust_exec_policy_t get_thrust_policy() const { + rmm::exec_policy get_thrust_policy() const { std::lock_guard _(mutex_); if (!thrust_policy_initialized_) { - if (!thrust_policy_) { - thrust_policy_ = - (thrust_exec_policy_t*)malloc(sizeof(thrust_exec_policy_t)); - } - *thrust_policy_ = rmm::exec_policy(this->get_stream()); + thrust_policy_ = new rmm::exec_policy(get_stream()); thrust_policy_initialized_ = true; } return *thrust_policy_; @@ -240,7 +236,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - mutable thrust_exec_policy_t* thrust_policy_{nullptr}; + mutable rmm::exec_policy* thrust_policy_{nullptr}; mutable bool thrust_policy_initialized_{false}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; From e13696e09b88d902e11c9b514691a10e6e491408 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 4 Aug 2021 14:10:38 +0200 Subject: [PATCH 10/17] Small updates --- cpp/cmake/thirdparty/get_rmm.cmake | 17 ++++++++++++----- cpp/include/raft/handle.hpp | 3 --- cpp/include/raft/sparse/linalg/spectral.cuh | 4 +--- .../spatial/knn/detail/ann_quantized_faiss.cuh | 1 - 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..8a9f3ffe45 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,7 +14,7 @@ # limitations under the License. 
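For callers, the upshot of the get_thrust_policy() change in PATCH 09 above is that the handle can hand out an rmm::exec_policy bound to its current stream, so Thrust algorithms run on that stream and draw their temporary storage from RMM. A hedged usage sketch; the sort itself is illustrative, not part of the patch:

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/sort.h>

void sort_on_handle(const raft::handle_t& handle) {
  rmm::device_uvector<float> keys(1024, handle.get_stream());
  // ... fill `keys` asynchronously on the handle's stream ...
  // The policy enqueues on handle.get_stream() and allocates scratch via RMM.
  thrust::sort(handle.get_thrust_policy(), keys.begin(), keys.end());
}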
#============================================================================= -function(find_and_configure_rmm VERSION) +function(find_and_configure_rmm) if(TARGET rmm::rmm) return() @@ -26,13 +26,17 @@ function(find_and_configure_rmm VERSION) set(MAJOR_AND_MINOR "${VERSION}") endif() - rapids_cpm_find(rmm ${VERSION} + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(rmm ${PKG_VERSION} GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} + GIT_REPOSITORY https://github.com/${PKG_FORK}/rmm.git + GIT_TAG ${PKG_PINNED_TAG} GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" @@ -44,4 +48,7 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm(VERSION ${RAFT_MIN_VERSION_rmm} + FORK viclafargue + PINNED_TAG 84a1328bfb894ca23b8b5efce358473c358d47bf + ) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index aa0fd8af8d..bcec17e7d3 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -47,9 +47,6 @@ namespace raft { * necessary cuda kernels and/or libraries */ class handle_t { - using thrust_exec_policy_t = thrust::detail::execute_with_allocator< - rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; - private: static constexpr int kNumDefaultWorkerStreams = 0; diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index d15c2cdf23..ce0c4bbe6f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -63,8 +63,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = handle.get_thrust_policy(); - using thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; @@ -90,7 +88,7 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + raft::spectral::partition(handle, r_csr_m, eig_solver, no_op_cluster_solver_t{}, labels.data(), eigVals.data(), eigVecs.data()); diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 43bdf12a38..77ad4afe96 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,6 @@ #include -#include #include #include From afd1a1bfdc08b013b562cd6bda45dfa3bfeab945 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 4 Aug 2021 15:25:42 +0200 Subject: [PATCH 11/17] Use of CUDA stream view --- cpp/include/raft/cudart_utils.h | 37 +++++++++++++++++++-------------- cpp/test/sparse/degree.cu | 12 +++++------ cpp/test/spatial/haversine.cu | 6 +++--- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 3c07f1974d..b46b5457a4 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include 
#include @@ -203,9 +204,10 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, + rmm::cuda_stream_view stream) { + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, + stream.value())); } /** @@ -217,22 +219,22 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { /** performs a host to device copy */ template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { - copy(d_ptr, h_ptr, len, stream); + rmm::cuda_stream_view stream) { + copy(d_ptr, h_ptr, len, stream.value()); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { - copy(h_ptr, d_ptr, len, stream); + rmm::cuda_stream_view stream) { + copy(h_ptr, d_ptr, len, stream.value()); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, stream.value())); } /** @} */ @@ -266,30 +268,33 @@ static std::mutex mutex_; static std::unordered_map allocations; template -void allocate(Type*& ptr, size_t len, cudaStream_t stream, +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) { size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); - if (setZero) CUDA_CHECK(cudaMemset((void*)ptr, 0, size)); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, + stream.value()); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream.value())); std::lock_guard _(mutex_); allocations[ptr] = size; } template -void deallocate(Type*& ptr, cudaStream_t stream) { +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, + stream.value()); } -inline void deallocate_all(cudaStream_t stream) { +inline void deallocate_all(rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); for (auto& alloc : allocations) { void* ptr = alloc.first; size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + rmm::mr::get_current_device_resource()->deallocate(ptr, size, + stream.value()); } allocations.clear(); } diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index eede0dfc10..f8a469af45 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -59,10 +59,10 @@ TEST_P(COODegree, Result) { raft::allocate(verify, 5, stream, true); raft::allocate(results, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); - linalg::coo_degree<32>(in_rows, 5, results, 0); + linalg::coo_degree<32>(in_rows, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); @@ -88,9 +88,9 @@ TEST_P(COODegreeNonzero, Result) { raft::allocate(results, 5, stream, 
true); raft::allocate(in_vals, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); - raft::update_device(in_vals, *&in_vals_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); + raft::update_device(in_vals, *&in_vals_h, 5, stream); linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream); cudaDeviceSynchronize(); diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 542f2d6fc8..122d7f2d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -49,7 +49,7 @@ class HaversineKNNTest : public ::testing::Test { 0.53154002, -1.47049808, 0.72891737, -1.54095137}; h_train_inputs.resize(n); - raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); + raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, stream); std::vector h_res_D = { 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, @@ -59,13 +59,13 @@ class HaversineKNNTest : public ::testing::Test { 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); - raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); + raft::update_device(d_ref_D, h_res_D.data(), n * n, stream); std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); + raft::update_device(d_ref_I, h_res_I.data(), n * n, stream); std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; From 598463f9fcedef93549dc64b1d47c4b6afd37c59 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 11:04:51 +0200 Subject: [PATCH 12/17] Apply requested changes --- cpp/include/raft/label/classlabels.cuh | 1 - cpp/include/raft/linalg/eig.cuh | 2 -- 2 files changed, 3 deletions(-) diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index b7878dc276..e12620ee0f 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -66,7 +66,6 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); *n_unique = d_num_selected.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output *y_unique = diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 951e2b8e7a..7fdbd371af 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -65,7 +65,6 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, CUDA_CHECK(cudaGetLastError()); int dev_info = d_dev_info.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -128,7 +127,6 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); int dev_info = d_dev_info.value(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" "This usually occurs when some of the features do not vary enough."); From 819f698776ff9aa710cee1783a45990e7fc97f32 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 14:38:13 +0200 Subject: [PATCH 13/17] Completing adoption of rmm::exec_policy --- cpp/include/raft/sparse/op/filter.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 5383f6fe7e..492058f85f 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -93,14 +94,14 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); @@ -140,7 +141,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); From f990cf91c3518a128115808af5a4105aa105e8b4 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 10 Aug 2021 14:38:50 +0200 Subject: [PATCH 14/17] get_rmm.cmake beck to default --- cpp/cmake/thirdparty/get_rmm.cmake | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 8a9f3ffe45..9efee7cf29 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,7 +14,7 @@ # limitations under the License. 
#============================================================================= -function(find_and_configure_rmm) +function(find_and_configure_rmm VERSION) if(TARGET rmm::rmm) return() @@ -26,17 +26,13 @@ function(find_and_configure_rmm) set(MAJOR_AND_MINOR "${VERSION}") endif() - set(oneValueArgs VERSION FORK PINNED_TAG) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - rapids_cpm_find(rmm ${PKG_VERSION} + rapids_cpm_find(rmm ${VERSION} GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports CPM_ARGS - GIT_REPOSITORY https://github.com/${PKG_FORK}/rmm.git - GIT_TAG ${PKG_PINNED_TAG} + GIT_REPOSITORY https://github.com/rapidsai/rmm.git + GIT_TAG branch-${MAJOR_AND_MINOR} GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" @@ -48,7 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(VERSION ${RAFT_MIN_VERSION_rmm} - FORK viclafargue - PINNED_TAG 84a1328bfb894ca23b8b5efce358473c358d47bf - ) +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file From be9b5bbdd25b9d1b01efa7d575ba46fe16a19cda Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 16 Aug 2021 13:12:58 +0200 Subject: [PATCH 15/17] Address requested changes --- cpp/include/raft/handle.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index bcec17e7d3..b045c67c98 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -146,10 +146,6 @@ class handle_t { return *thrust_policy_; } - rmm::exec_policy get_thrust_policy(cudaStream_t stream) const { - return rmm::exec_policy(stream); - } - // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); From c9e0fa82af6d5b0316df4706e2c914534a5ad4d8 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Wed, 18 Aug 2021 20:01:21 +0200 Subject: [PATCH 16/17] Replace stream.value() occurrences --- cpp/include/raft/cudart_utils.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index b46b5457a4..85ca310530 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -206,8 +206,8 @@ class grid_1d_block_t { template void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, - stream.value())); + CUDA_CHECK( + cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -220,21 +220,21 @@ void copy(Type* dst, const Type* src, size_t len, template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) { - copy(d_ptr, h_ptr, len, stream.value()); + copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) { - copy(h_ptr, d_ptr, len, stream.value()); + copy(h_ptr, d_ptr, len, stream); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream.value())); + cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -271,9 +271,8 @@ template void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) { size_t 
size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, - stream.value()); - if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream.value())); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); std::lock_guard _(mutex_); allocations[ptr] = size; @@ -284,8 +283,7 @@ void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); - rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, - stream.value()); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); } inline void deallocate_all(rmm::cuda_stream_view stream) { @@ -293,8 +291,7 @@ inline void deallocate_all(rmm::cuda_stream_view stream) { for (auto& alloc : allocations) { void* ptr = alloc.first; size_t size = alloc.second; - rmm::mr::get_current_device_resource()->deallocate(ptr, size, - stream.value()); + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); } allocations.clear(); } From c2a3fbd39ef4cd6ae70b7af9cbc3129798e3cf9e Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 19 Aug 2021 11:11:55 +0200 Subject: [PATCH 17/17] Update RAFT handle --- cpp/include/raft/handle.hpp | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index b045c67c98..c925669530 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -64,6 +64,7 @@ class handle_t { }()), streams_(n_streams) { create_resources(); + thrust_policy_ = std::make_unique(user_stream_); } /** @@ -85,6 +86,7 @@ class handle_t { device_prop_initialized_ = true; create_resources(); set_stream(other.get_internal_stream(stream_id)); + thrust_policy_ = std::make_unique(user_stream_); } /** Destroys all held-up resources */ @@ -92,10 +94,7 @@ class handle_t { int get_device() const { return dev_id_; } - void set_stream(cudaStream_t stream) { - thrust_policy_initialized_ = false; - user_stream_ = stream; - } + void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); @@ -137,14 +136,7 @@ class handle_t { return cusparse_handle_; } - rmm::exec_policy get_thrust_policy() const { - std::lock_guard _(mutex_); - if (!thrust_policy_initialized_) { - thrust_policy_ = new rmm::exec_policy(get_stream()); - thrust_policy_initialized_ = true; - } - return *thrust_policy_; - } + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { @@ -229,8 +221,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - mutable rmm::exec_policy* thrust_policy_{nullptr}; - mutable bool thrust_policy_initialized_{false}; + std::unique_ptr thrust_policy_{nullptr}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; @@ -259,9 +250,6 @@ class handle_t { //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - if (thrust_policy_initialized_) { - delete thrust_policy_; - } //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); }
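The last two commits lean on rmm::cuda_stream_view being implicitly constructible from cudaStream_t and implicitly convertible back to it; that is why the explicit stream.value() calls could be dropped, and why the handle can build its rmm::exec_policy once in the constructor and return it by reference. A small sketch of that round trip, assuming only the CUDA_CHECK macro from cudart_utils.h; the functions are illustrative, not part of the patch:

#include <raft/cudart_utils.h>
#include <rmm/cuda_stream_view.hpp>

// A cuda_stream_view parameter accepts a raw cudaStream_t at the call site...
void zero_fill(void* ptr, size_t bytes, rmm::cuda_stream_view stream) {
  // ...and converts back implicitly where the CUDA runtime expects cudaStream_t.
  CUDA_CHECK(cudaMemsetAsync(ptr, 0, bytes, stream));
}

void round_trip(void* ptr, size_t bytes) {
  cudaStream_t raw;
  CUDA_CHECK(cudaStreamCreate(&raw));
  zero_fill(ptr, bytes, raw);  // implicit cudaStream_t -> cuda_stream_view
  CUDA_CHECK(cudaStreamSynchronize(raw));
  CUDA_CHECK(cudaStreamDestroy(raw));
}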