From 4ba7510565d12455e777cf8e370f9202a9e71621 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Aug 2023 15:19:40 -0700 Subject: [PATCH 01/11] Don't keep extra `pruned_graph` copy in optimize This just gets copied over to the output, and requires multiple copies of the output graph on host memory. --- .../raft/neighbors/detail/cagra/graph_core.cuh | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index 0558d7ea39..f060bca36b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -334,8 +334,6 @@ void optimize(raft::resources const& res, auto output_graph_ptr = new_graph.data_handle(); const IdxT graph_size = new_graph.extent(0); - auto pruned_graph = raft::make_host_matrix(graph_size, output_graph_degree); - { // // Prune kNN graph @@ -380,6 +378,7 @@ void optimize(raft::resources const& res, input_graph_ptr, graph_size * input_graph_degree, resource::get_cuda_stream(res)); + void (*kernel_prune)(const IdxT* const, const uint32_t, const uint32_t, @@ -447,7 +446,7 @@ void optimize(raft::resources const& res, if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } for (uint64_t k = 0; k < input_graph_degree; k++) { if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } - pruned_graph.data_handle()[pk + (output_graph_degree * i)] = + output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; if (pk >= output_graph_degree) break; @@ -497,7 +496,7 @@ void optimize(raft::resources const& res, for (uint64_t k = 0; k < output_graph_degree; k++) { #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)]; + dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; } resource::sync_stream(res); @@ -542,10 +541,6 @@ void optimize(raft::resources const& res, const uint64_t num_protected_edges = output_graph_degree / 2; RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges); - memcpy(output_graph_ptr, - pruned_graph.data_handle(), - sizeof(IdxT) * graph_size * output_graph_degree); - constexpr int _omp_chunk = 1024; #pragma omp parallel for schedule(dynamic, _omp_chunk) for (uint64_t j = 0; j < graph_size; j++) { @@ -578,7 +573,7 @@ void optimize(raft::resources const& res, #pragma omp parallel for reduction(+ : num_replaced_edges) for (uint64_t i = 0; i < graph_size; i++) { for (uint64_t k = 0; k < output_graph_degree; k++) { - const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)]; + const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; const uint64_t pos = pos_in_array(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree); if (pos == output_graph_degree) { num_replaced_edges += 1; } From c5e499822389eab531901802b2d67e58c18461d9 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Aug 2023 15:27:22 -0700 Subject: [PATCH 02/11] in serialize, don't take copy of dataset if not strided --- .../detail/cagra/cagra_serialize.cuh | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh index 2c9cbd2563..d0e1f8ea4f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh @@ -71,18 +71,29 @@ void serialize(raft::resources const& res, serialize_scalar(res, os, include_dataset); if (include_dataset) { auto dataset = index_.dataset(); - // Remove padding before saving the dataset - auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), - sizeof(T) * host_dataset.extent(1), - dataset.data_handle(), - sizeof(T) * dataset.stride(0), - sizeof(T) * host_dataset.extent(1), - dataset.extent(0), - cudaMemcpyDefault, - resource::get_cuda_stream(res))); - resource::sync_stream(res); - serialize_mdspan(res, os, host_dataset.view()); + if (dataset.stride(0) == dataset.extent(0)) { + // Rather than take another copy of the dataset here, just write it out directly. + // Since the dataset is a strided layout, we can't pass directly to the serialize_mdspan + // - but since the stride is the same as the extent, we can convert to a row-major + // mdspan + serialize_mdspan( + res, + os, + make_device_matrix_view(dataset.data_handle(), dataset.extent(0), dataset.extent(1))); + } else { + // Remove padding before saving the dataset + auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + dataset.data_handle(), + sizeof(T) * dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + dataset.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + resource::sync_stream(res); + serialize_mdspan(res, os, host_dataset.view()); + } } } From 5d9c95927df0bbb69f231c543cdfb1b3d5bc765f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Aug 2023 15:59:29 -0700 Subject: [PATCH 03/11] simplify kern_prune --- .../neighbors/detail/cagra/graph_core.cuh | 35 +++++++------------ 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index f060bca36b..eeab3fb612 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -379,20 +379,8 @@ void optimize(raft::resources const& res, graph_size * input_graph_degree, resource::get_cuda_stream(res)); - void (*kernel_prune)(const IdxT* const, - const uint32_t, - const uint32_t, - const uint32_t, - const uint32_t, - const uint32_t, - uint8_t* const, - uint32_t* const, - uint64_t* const); - constexpr int MAX_DEGREE = 1024; - if (input_graph_degree <= MAX_DEGREE) { - kernel_prune = kern_prune; - } else { + if (input_graph_degree > MAX_DEGREE) { RAFT_FAIL( "The degree of input knn graph is too large (%u). " "It must be equal to or smaller than %d.", @@ -409,16 +397,17 @@ void optimize(raft::resources const& res, dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res))); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - kernel_prune<<>>( - d_input_graph.data_handle(), - graph_size, - input_graph_degree, - output_graph_degree, - batch_size, - i_batch, - d_detour_count.data_handle(), - d_num_no_detour_edges.data_handle(), - dev_stats.data_handle()); + kern_prune + <<>>( + d_input_graph.data_handle(), + graph_size, + input_graph_degree, + output_graph_degree, + batch_size, + i_batch, + d_detour_count.data_handle(), + d_num_no_detour_edges.data_handle(), + dev_stats.data_handle()); resource::sync_stream(res); RAFT_LOG_DEBUG( "# Pruning kNN Graph on GPUs (%.1lf %%)\r", From 203667fd8411296018e83e7bbc13b386729c1248 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Aug 2023 17:12:14 -0700 Subject: [PATCH 04/11] avoid multiple detour_count matrices when using managed memory --- .../neighbors/detail/cagra/graph_core.cuh | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index eeab3fb612..f38bc4c113 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -341,9 +341,9 @@ void optimize(raft::resources const& res, auto d_input_graph = raft::make_device_matrix(res, graph_size, input_graph_degree); - auto detour_count = raft::make_host_matrix(graph_size, input_graph_degree); auto d_detour_count = raft::make_device_matrix(res, graph_size, input_graph_degree); + RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), 0xff, graph_size * input_graph_degree * sizeof(uint8_t), @@ -416,10 +416,23 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(detour_count.data_handle(), - d_detour_count.data_handle(), - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); + // if we're using a managed memory resource, we don't need to make a separate copy of + // detour_count on the host - and can save allocating the extra host memory + uint8_t* host_detour_count_ptr = NULL; + std::optional> detour_count; + switch (spatial::knn::detail::utils::check_pointer_residency(d_detour_count.data_handle())) { + case spatial::knn::detail::utils::pointer_residency::host_and_device: + host_detour_count_ptr = d_detour_count.data_handle(); + break; + default: + detour_count.emplace( + raft::make_host_matrix(graph_size, input_graph_degree)); + raft::copy(detour_count->data_handle(), + d_detour_count.data_handle(), + graph_size * input_graph_degree, + resource::get_cuda_stream(res)); + host_detour_count_ptr = detour_count->data_handle(); + } raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res)); @@ -434,7 +447,7 @@ void optimize(raft::resources const& res, for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) { if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } for (uint64_t k = 0; k < input_graph_degree; k++) { - if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } + if (host_detour_count_ptr[k + (input_graph_degree * i)] != num_detour) { continue; } output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; From f803deb8be6e5838ceb0cf45468d2080139f3c99 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 28 Aug 2023 21:07:49 -0700 Subject: [PATCH 05/11] free intermediate graph before creating index --- cpp/include/raft/neighbors/cagra.cuh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index 6bb7beca55..903d0571dc 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -256,13 +256,17 @@ index build(raft::resources const& res, graph_degree = intermediate_degree; } - auto knn_graph = raft::make_host_matrix(dataset.extent(0), intermediate_degree); + std::optional> knn_graph( + raft::make_host_matrix(dataset.extent(0), intermediate_degree)); - build_knn_graph(res, dataset, knn_graph.view()); + build_knn_graph(res, dataset, knn_graph->view()); auto cagra_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); - optimize(res, knn_graph.view(), cagra_graph.view()); + optimize(res, knn_graph->view(), cagra_graph.view()); + + // free intermediate graph before trying to create the index + knn_graph.reset(); // Construct an index from dataset and optimized knn graph. return index(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view())); From c2eb1e37ca732d9b9ea068fe70ade2c896752f04 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 29 Aug 2023 10:39:29 -0700 Subject: [PATCH 06/11] don't take device copy of intermediate graph unless necessary --- .../neighbors/detail/cagra/graph_core.cuh | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index f38bc4c113..947b8e66ca 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -338,9 +338,6 @@ void optimize(raft::resources const& res, // // Prune kNN graph // - auto d_input_graph = - raft::make_device_matrix(res, graph_size, input_graph_degree); - auto d_detour_count = raft::make_device_matrix(res, graph_size, input_graph_degree); @@ -374,10 +371,24 @@ void optimize(raft::resources const& res, const double time_prune_start = cur_time(); RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - raft::copy(d_input_graph.data_handle(), - input_graph_ptr, - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); + // only make a device copy of the the graph if it isn't already accessible on the + // device (which might already be because of uvm/hmm/ats etc) + std::optional> d_input_graph; + IdxT* d_input_graph_ptr = NULL; + switch (spatial::knn::detail::utils::check_pointer_residency(input_graph_ptr)) { + case spatial::knn::detail::utils::pointer_residency::host_and_device: + case spatial::knn::detail::utils::pointer_residency::device_only: + d_input_graph_ptr = input_graph_ptr; + break; + default: + d_input_graph.emplace( + raft::make_device_matrix(res, graph_size, input_graph_degree)); + raft::copy(d_input_graph->data_handle(), + input_graph_ptr, + graph_size * input_graph_degree, + resource::get_cuda_stream(res)); + d_input_graph_ptr = d_input_graph->data_handle(); + } constexpr int MAX_DEGREE = 1024; if (input_graph_degree > MAX_DEGREE) { @@ -399,7 +410,7 @@ void optimize(raft::resources const& res, for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune <<>>( - d_input_graph.data_handle(), + d_input_graph_ptr, graph_size, input_graph_degree, output_graph_degree, @@ -416,7 +427,7 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - // if we're using a managed memory resource, we don't need to make a separate copy of + // if we're using a managed memory (or HMM/ATS), we don't need to make a separate copy of // detour_count on the host - and can save allocating the extra host memory uint8_t* host_detour_count_ptr = NULL; std::optional> detour_count; From eb86b1ec2d7ef754f99f90e3f7a59ce9d5aecdc6 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 29 Aug 2023 13:25:47 -0700 Subject: [PATCH 07/11] refactor --- cpp/include/raft/neighbors/cagra_types.hpp | 11 +++ .../neighbors/detail/cagra/graph_core.cuh | 43 ++--------- .../raft/neighbors/detail/cagra/utils.hpp | 77 +++++++++++++++++++ 3 files changed, 95 insertions(+), 36 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 02e3f5338e..7e2b6bc06e 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -287,6 +287,17 @@ struct index : ann::index { graph_view_ = knn_graph; } + /** + * Replace the graph with a new graph, taking ownership of the input device + * matrix. The index will manage the lifetime. + */ + void update_graph(raft::resources const& res, + raft::device_matrix&& knn_graph) + { + graph_ = std::move(knn_graph); + graph_view_ = graph_.view(); + } + /** * Replace the graph with a new graph. * diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index 947b8e66ca..4b3546f8c8 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -371,24 +371,10 @@ void optimize(raft::resources const& res, const double time_prune_start = cur_time(); RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - // only make a device copy of the the graph if it isn't already accessible on the - // device (which might already be because of uvm/hmm/ats etc) - std::optional> d_input_graph; - IdxT* d_input_graph_ptr = NULL; - switch (spatial::knn::detail::utils::check_pointer_residency(input_graph_ptr)) { - case spatial::knn::detail::utils::pointer_residency::host_and_device: - case spatial::knn::detail::utils::pointer_residency::device_only: - d_input_graph_ptr = input_graph_ptr; - break; - default: - d_input_graph.emplace( - raft::make_device_matrix(res, graph_size, input_graph_degree)); - raft::copy(d_input_graph->data_handle(), - input_graph_ptr, - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); - d_input_graph_ptr = d_input_graph->data_handle(); - } + // Copy input_graph_ptr over to device if necessary + device_matrix_view_from_host d_input_graph( + res, + raft::make_host_matrix_view(input_graph_ptr, graph_size, input_graph_degree)); constexpr int MAX_DEGREE = 1024; if (input_graph_degree > MAX_DEGREE) { @@ -410,7 +396,7 @@ void optimize(raft::resources const& res, for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune <<>>( - d_input_graph_ptr, + d_input_graph.view().data_handle(), graph_size, input_graph_degree, output_graph_degree, @@ -427,23 +413,8 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - // if we're using a managed memory (or HMM/ATS), we don't need to make a separate copy of - // detour_count on the host - and can save allocating the extra host memory - uint8_t* host_detour_count_ptr = NULL; - std::optional> detour_count; - switch (spatial::knn::detail::utils::check_pointer_residency(d_detour_count.data_handle())) { - case spatial::knn::detail::utils::pointer_residency::host_and_device: - host_detour_count_ptr = d_detour_count.data_handle(); - break; - default: - detour_count.emplace( - raft::make_host_matrix(graph_size, input_graph_degree)); - raft::copy(detour_count->data_handle(), - d_detour_count.data_handle(), - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); - host_detour_count_ptr = detour_count->data_handle(); - } + host_matrix_view_from_device detour_count(res, d_detour_count.view()); + uint8_t* host_detour_count_ptr = detour_count.view().data_handle(); raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res)); diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 22c7a60647..72d57fa413 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include namespace raft::neighbors::cagra::detail { @@ -150,4 +152,79 @@ struct gen_index_msb_1_mask { }; } // namespace utils +/** + * Utility to sync memory from a host_matrix_view to a device_matrix_view + * + * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the + * device, and no copy or extra allocations need to be performed. This class checks + * if the host_matrix_view is already accessible on the device, and only creates device + * memory and copies over if necessary + */ +template +struct device_matrix_view_from_host { + device_matrix_view_from_host(raft::resources const& res, host_matrix_view host_view) + : host_view_(host_view) + { + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); + device_ptr = reinterpret_cast(attr.devicePointer); + if (device_ptr == NULL) { + // allocate memory and copy over + device_mem_.emplace( + raft::make_device_matrix(res, host_view.extent(0), host_view.extent(1))); + raft::copy(device_mem_->data_handle(), + host_view.data_handle(), + host_view.extent(0) * host_view.extent(1), + resource::get_cuda_stream(res)); + device_ptr = device_mem_->data_handle(); + } + } + + device_matrix_view view() + { + return make_device_matrix_view(device_ptr, host_view_.extent(0), host_view_.extent(1)); + } + + std::optional> device_mem_; + host_matrix_view host_view_; + T* device_ptr; +}; + +/** + * Utility to sync memory from a device_matrix_view to a host_matrix_view + * + * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the + * host, and no copy or extra allocations need to be performed. This class checks + * if the device_matrix_view is already accessible on the host, and only creates host + * memory and copies over if necessary + */ +template +struct host_matrix_view_from_device { + host_matrix_view_from_device(raft::resources const& res, device_matrix_view device_view) + : device_view_(device_view) + { + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle())); + host_ptr = reinterpret_cast(attr.hostPointer); + if (host_ptr == NULL) { + // allocate memory and copy over + host_mem_.emplace( + raft::make_host_matrix(device_view.extent(0), device_view.extent(1))); + raft::copy(host_mem_->data_handle(), + device_view.data_handle(), + device_view.extent(0) * device_view.extent(1), + resource::get_cuda_stream(res)); + host_ptr = host_mem_->data_handle(); + } + } + + host_matrix_view view() + { + return make_host_matrix_view(host_ptr, device_view_.extent(0), device_view_.extent(1)); + } + + std::optional> host_mem_; + device_matrix_view device_view_; + T* host_ptr; +}; } // namespace raft::neighbors::cagra::detail From 191e7fcd87e4a01d9e76049cb33592d94f5d6c85 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 29 Aug 2023 17:59:21 -0700 Subject: [PATCH 08/11] rev_graph --- .../neighbors/detail/cagra/graph_core.cuh | 18 ++++++------- .../raft/neighbors/detail/cagra/utils.hpp | 26 ++++++++++++++----- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index 4b3546f8c8..18d451be60 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -396,7 +396,7 @@ void optimize(raft::resources const& res, for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune <<>>( - d_input_graph.view().data_handle(), + d_input_graph.data_handle(), graph_size, input_graph_degree, output_graph_degree, @@ -414,7 +414,6 @@ void optimize(raft::resources const& res, RAFT_LOG_DEBUG("\n"); host_matrix_view_from_device detour_count(res, d_detour_count.view()); - uint8_t* host_detour_count_ptr = detour_count.view().data_handle(); raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res)); @@ -429,7 +428,7 @@ void optimize(raft::resources const& res, for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) { if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } for (uint64_t k = 0; k < input_graph_degree; k++) { - if (host_detour_count_ptr[k + (input_graph_degree * i)] != num_detour) { continue; } + if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; @@ -461,8 +460,7 @@ void optimize(raft::resources const& res, // const double time_make_start = cur_time(); - auto d_rev_graph = - raft::make_device_matrix(res, graph_size, output_graph_degree); + device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), 0xff, graph_size * output_graph_degree * sizeof(IdxT), @@ -503,10 +501,12 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(rev_graph.data_handle(), - d_rev_graph.data_handle(), - graph_size * output_graph_degree, - resource::get_cuda_stream(res)); + if (d_rev_graph.allocated_memory()) { + raft::copy(rev_graph.data_handle(), + d_rev_graph.data_handle(), + graph_size * output_graph_degree, + resource::get_cuda_stream(res)); + } raft::copy(rev_graph_count.data_handle(), d_rev_graph_count.data_handle(), graph_size, diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 72d57fa413..686fdc9bbe 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -156,12 +156,14 @@ struct gen_index_msb_1_mask { * Utility to sync memory from a host_matrix_view to a device_matrix_view * * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the - * device, and no copy or extra allocations need to be performed. This class checks + * device, and no extra allocations need to be performed. This class checks * if the host_matrix_view is already accessible on the device, and only creates device - * memory and copies over if necessary + * memory and copies over if necessary. In memory limited situations this is preferable + * to having both a host and device copy */ template -struct device_matrix_view_from_host { +class device_matrix_view_from_host { + public: device_matrix_view_from_host(raft::resources const& res, host_matrix_view host_view) : host_view_(host_view) { @@ -185,6 +187,11 @@ struct device_matrix_view_from_host { return make_device_matrix_view(device_ptr, host_view_.extent(0), host_view_.extent(1)); } + T* data_handle() { return device_ptr; } + + bool allocated_memory() const { return device_mem_.has_value(); } + + private: std::optional> device_mem_; host_matrix_view host_view_; T* device_ptr; @@ -194,12 +201,14 @@ struct device_matrix_view_from_host { * Utility to sync memory from a device_matrix_view to a host_matrix_view * * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the - * host, and no copy or extra allocations need to be performed. This class checks + * host, and no extra allocations need to be performed. This class checks * if the device_matrix_view is already accessible on the host, and only creates host - * memory and copies over if necessary + * memory and copies over if necessary. In memory limited situations this is preferable + * to having both a host and device copy */ template -struct host_matrix_view_from_device { +class host_matrix_view_from_device { + public: host_matrix_view_from_device(raft::resources const& res, device_matrix_view device_view) : device_view_(device_view) { @@ -223,6 +232,11 @@ struct host_matrix_view_from_device { return make_host_matrix_view(host_ptr, device_view_.extent(0), device_view_.extent(1)); } + T* data_handle() { return host_ptr; } + + bool allocated_memory() const { return host_mem_.has_value(); } + + private: std::optional> host_mem_; device_matrix_view device_view_; T* host_ptr; From f539655f770f7cd92fb3e9a064f1210662bde68a Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 30 Aug 2023 16:06:13 -0700 Subject: [PATCH 09/11] remove accidental checkin --- cpp/include/raft/neighbors/cagra_types.hpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 7e2b6bc06e..02e3f5338e 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -287,17 +287,6 @@ struct index : ann::index { graph_view_ = knn_graph; } - /** - * Replace the graph with a new graph, taking ownership of the input device - * matrix. The index will manage the lifetime. - */ - void update_graph(raft::resources const& res, - raft::device_matrix&& knn_graph) - { - graph_ = std::move(knn_graph); - graph_view_ = graph_.view(); - } - /** * Replace the graph with a new graph. * From 0e0c5f3b64803ab145b390277186989baffccaf9 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 30 Aug 2023 16:12:52 -0700 Subject: [PATCH 10/11] revert serialization change was still making a copy from device->host inside serialize_mdspan, and with the include_dataset changes this branch won't even be called --- .../detail/cagra/cagra_serialize.cuh | 35 +++++++------------ 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh index d0e1f8ea4f..2c9cbd2563 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh @@ -71,29 +71,18 @@ void serialize(raft::resources const& res, serialize_scalar(res, os, include_dataset); if (include_dataset) { auto dataset = index_.dataset(); - if (dataset.stride(0) == dataset.extent(0)) { - // Rather than take another copy of the dataset here, just write it out directly. - // Since the dataset is a strided layout, we can't pass directly to the serialize_mdspan - // - but since the stride is the same as the extent, we can convert to a row-major - // mdspan - serialize_mdspan( - res, - os, - make_device_matrix_view(dataset.data_handle(), dataset.extent(0), dataset.extent(1))); - } else { - // Remove padding before saving the dataset - auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), - sizeof(T) * host_dataset.extent(1), - dataset.data_handle(), - sizeof(T) * dataset.stride(0), - sizeof(T) * host_dataset.extent(1), - dataset.extent(0), - cudaMemcpyDefault, - resource::get_cuda_stream(res))); - resource::sync_stream(res); - serialize_mdspan(res, os, host_dataset.view()); - } + // Remove padding before saving the dataset + auto host_dataset = make_host_matrix(dataset.extent(0), dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + dataset.data_handle(), + sizeof(T) * dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + dataset.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + resource::sync_stream(res); + serialize_mdspan(res, os, host_dataset.view()); } } From 6552c66c820431285c5ee01705dbfaaf7b4435ed Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 5 Sep 2023 10:28:40 -0700 Subject: [PATCH 11/11] Add TODO about using mdbuffer --- cpp/include/raft/neighbors/detail/cagra/utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 686fdc9bbe..22cbe6bbac 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -160,6 +160,8 @@ struct gen_index_msb_1_mask { * if the host_matrix_view is already accessible on the device, and only creates device * memory and copies over if necessary. In memory limited situations this is preferable * to having both a host and device copy + * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer + * have been merged, we should remove this class and switch over to using mdbuffer for this */ template class device_matrix_view_from_host { @@ -205,6 +207,8 @@ class device_matrix_view_from_host { * if the device_matrix_view is already accessible on the host, and only creates host * memory and copies over if necessary. In memory limited situations this is preferable * to having both a host and device copy + * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer + * have been merged, we should remove this class and switch over to using mdbuffer for this */ template class host_matrix_view_from_device {