From 4ba7510565d12455e777cf8e370f9202a9e71621 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 28 Aug 2023 15:19:40 -0700
Subject: [PATCH 01/11] Don't keep extra `pruned_graph` copy in optimize

This just gets copied over to the output, and requires multiple copies of the
output graph on host memory.
---
 .../raft/neighbors/detail/cagra/graph_core.cuh      | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 0558d7ea39..f060bca36b 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -334,8 +334,6 @@ void optimize(raft::resources const& res,
   auto output_graph_ptr              = new_graph.data_handle();
   const IdxT graph_size              = new_graph.extent(0);
 
-  auto pruned_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
-
   {
     //
     // Prune kNN graph
@@ -380,6 +378,7 @@ void optimize(raft::resources const& res,
                input_graph_ptr,
                graph_size * input_graph_degree,
                resource::get_cuda_stream(res));
+
     void (*kernel_prune)(const IdxT* const,
                          const uint32_t,
                          const uint32_t,
@@ -447,7 +446,7 @@ void optimize(raft::resources const& res,
         if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
         for (uint64_t k = 0; k < input_graph_degree; k++) {
           if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
-          pruned_graph.data_handle()[pk + (output_graph_degree * i)] =
+          output_graph_ptr[pk + (output_graph_degree * i)] =
             input_graph_ptr[k + (input_graph_degree * i)];
           pk += 1;
           if (pk >= output_graph_degree) break;
@@ -497,7 +496,7 @@ void optimize(raft::resources const& res,
     for (uint64_t k = 0; k < output_graph_degree; k++) {
 #pragma omp parallel for
       for (uint64_t i = 0; i < graph_size; i++) {
-        dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
       }
       resource::sync_stream(res);
 
@@ -542,10 +541,6 @@ void optimize(raft::resources const& res,
     const uint64_t num_protected_edges = output_graph_degree / 2;
     RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);
 
-    memcpy(output_graph_ptr,
-           pruned_graph.data_handle(),
-           sizeof(IdxT) * graph_size * output_graph_degree);
-
     constexpr int _omp_chunk = 1024;
 #pragma omp parallel for schedule(dynamic, _omp_chunk)
     for (uint64_t j = 0; j < graph_size; j++) {
@@ -578,7 +573,7 @@ void optimize(raft::resources const& res,
 #pragma omp parallel for reduction(+ : num_replaced_edges)
     for (uint64_t i = 0; i < graph_size; i++) {
       for (uint64_t k = 0; k < output_graph_degree; k++) {
-        const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
         const uint64_t pos =
           pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
         if (pos == output_graph_degree) { num_replaced_edges += 1; }

From c5e499822389eab531901802b2d67e58c18461d9 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 28 Aug 2023 15:27:22 -0700
Subject: [PATCH 02/11] in serialize, don't take copy of dataset if not strided

---
 .../detail/cagra/cagra_serialize.cuh          | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index 2c9cbd2563..d0e1f8ea4f 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -71,18 +71,29 @@ void serialize(raft::resources const& res,
   serialize_scalar(res, os, include_dataset);
   if (include_dataset) {
     auto dataset = index_.dataset();
-    // Remove padding before saving the dataset
-    auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.data_handle(),
-                                    sizeof(T) * dataset.stride(0),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.extent(0),
-                                    cudaMemcpyDefault,
-                                    resource::get_cuda_stream(res)));
-    resource::sync_stream(res);
-    serialize_mdspan(res, os, host_dataset.view());
+    if (dataset.stride(0) == dataset.extent(0)) {
+      // Rather than take another copy of the dataset here, just write it out directly.
+      // Since the dataset is a strided layout, we can't pass directly to the serialize_mdspan
+      // - but since the stride is the same as the extent, we can convert to a row-major
+      // mdspan
+      serialize_mdspan(
+        res,
+        os,
+        make_device_matrix_view(dataset.data_handle(), dataset.extent(0), dataset.extent(1)));
+    } else {
+      // Remove padding before saving the dataset
+      auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
+      RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
+                                      sizeof(T) * host_dataset.extent(1),
+                                      dataset.data_handle(),
+                                      sizeof(T) * dataset.stride(0),
+                                      sizeof(T) * host_dataset.extent(1),
+                                      dataset.extent(0),
+                                      cudaMemcpyDefault,
+                                      resource::get_cuda_stream(res)));
+      resource::sync_stream(res);
+      serialize_mdspan(res, os, host_dataset.view());
+    }
   }
 }
 

From 5d9c95927df0bbb69f231c543cdfb1b3d5bc765f Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 28 Aug 2023 15:59:29 -0700
Subject: [PATCH 03/11] simplify kern_prune

---
 .../neighbors/detail/cagra/graph_core.cuh     | 35 +++++++------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index f060bca36b..eeab3fb612 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -379,20 +379,8 @@ void optimize(raft::resources const& res,
                graph_size * input_graph_degree,
                resource::get_cuda_stream(res));
 
-    void (*kernel_prune)(const IdxT* const,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         uint8_t* const,
-                         uint32_t* const,
-                         uint64_t* const);
-
     constexpr int MAX_DEGREE = 1024;
-    if (input_graph_degree <= MAX_DEGREE) {
-      kernel_prune = kern_prune<MAX_DEGREE, IdxT>;
-    } else {
+    if (input_graph_degree > MAX_DEGREE) {
       RAFT_FAIL(
         "The degree of input knn graph is too large (%u). "
         "It must be equal to or smaller than %d.",
@@ -409,16 +397,17 @@ void optimize(raft::resources const& res,
       dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res)));
 
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-      kernel_prune<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-        d_input_graph.data_handle(),
-        graph_size,
-        input_graph_degree,
-        output_graph_degree,
-        batch_size,
-        i_batch,
-        d_detour_count.data_handle(),
-        d_num_no_detour_edges.data_handle(),
-        dev_stats.data_handle());
+      kern_prune<MAX_DEGREE, IdxT>
+        <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
+          d_input_graph.data_handle(),
+          graph_size,
+          input_graph_degree,
+          output_graph_degree,
+          batch_size,
+          i_batch,
+          d_detour_count.data_handle(),
+          d_num_no_detour_edges.data_handle(),
+          dev_stats.data_handle());
       resource::sync_stream(res);
       RAFT_LOG_DEBUG(
         "# Pruning kNN Graph on GPUs (%.1lf %%)\r",

From 203667fd8411296018e83e7bbc13b386729c1248 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 28 Aug 2023 17:12:14 -0700
Subject: [PATCH 04/11] avoid multiple detour_count matrices when using managed
 memory

---
 .../neighbors/detail/cagra/graph_core.cuh     | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index eeab3fb612..f38bc4c113 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -341,9 +341,9 @@ void optimize(raft::resources const& res,
     auto d_input_graph =
       raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
 
-    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);
     auto d_detour_count =
       raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
+
     RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
                                   0xff,
                                   graph_size * input_graph_degree * sizeof(uint8_t),
@@ -416,10 +416,23 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(detour_count.data_handle(),
-               d_detour_count.data_handle(),
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
+    // if we're using a managed memory resource, we don't need to make a separate copy of
+    // detour_count on the host - and can save allocating the extra host memory
+    uint8_t* host_detour_count_ptr = NULL;
+    std::optional<host_matrix<uint8_t, int64_t>> detour_count;
+    switch (spatial::knn::detail::utils::check_pointer_residency(d_detour_count.data_handle())) {
+      case spatial::knn::detail::utils::pointer_residency::host_and_device:
+        host_detour_count_ptr = d_detour_count.data_handle();
+        break;
+      default:
+        detour_count.emplace(
+          raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree));
+        raft::copy(detour_count->data_handle(),
+                   d_detour_count.data_handle(),
+                   graph_size * input_graph_degree,
+                   resource::get_cuda_stream(res));
+        host_detour_count_ptr = detour_count->data_handle();
+    }
 
     raft::copy(
       host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
@@ -434,7 +447,7 @@ void optimize(raft::resources const& res,
       for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) {
         if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
         for (uint64_t k = 0; k < input_graph_degree; k++) {
-          if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
+          if (host_detour_count_ptr[k + (input_graph_degree * i)] != num_detour) { continue; }
           output_graph_ptr[pk + (output_graph_degree * i)] =
             input_graph_ptr[k + (input_graph_degree * i)];
           pk += 1;

From f803deb8be6e5838ceb0cf45468d2080139f3c99 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 28 Aug 2023 21:07:49 -0700
Subject: [PATCH 05/11] free intermediate graph before creating index

---
 cpp/include/raft/neighbors/cagra.cuh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index 6bb7beca55..903d0571dc 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -256,13 +256,17 @@ index<T, IdxT> build(raft::resources const& res,
     graph_degree = intermediate_degree;
   }
 
-  auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree);
+  std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
+    raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
 
-  build_knn_graph(res, dataset, knn_graph.view());
+  build_knn_graph(res, dataset, knn_graph->view());
 
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
-  optimize<IdxT>(res, knn_graph.view(), cagra_graph.view());
+  optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());
+
+  // free intermediate graph before trying to create the index
+  knn_graph.reset();
 
   // Construct an index from dataset and optimized knn graph.
   return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));

From c2eb1e37ca732d9b9ea068fe70ade2c896752f04 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 29 Aug 2023 10:39:29 -0700
Subject: [PATCH 06/11] don't take device copy of intermediate graph unless
 necessary

---
 .../neighbors/detail/cagra/graph_core.cuh     | 29 +++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index f38bc4c113..947b8e66ca 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -338,9 +338,6 @@ void optimize(raft::resources const& res,
     //
     // Prune kNN graph
     //
-    auto d_input_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
-
     auto d_detour_count =
       raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
 
@@ -374,10 +371,24 @@ void optimize(raft::resources const& res,
     const double time_prune_start = cur_time();
     RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-    raft::copy(d_input_graph.data_handle(),
-               input_graph_ptr,
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
+    // only make a device copy of the the graph if it isn't already accessible on the
+    // device (which might already be because of uvm/hmm/ats etc)
+    std::optional<device_matrix<IdxT, int64_t>> d_input_graph;
+    IdxT* d_input_graph_ptr = NULL;
+    switch (spatial::knn::detail::utils::check_pointer_residency(input_graph_ptr)) {
+      case spatial::knn::detail::utils::pointer_residency::host_and_device:
+      case spatial::knn::detail::utils::pointer_residency::device_only:
+        d_input_graph_ptr = input_graph_ptr;
+        break;
+      default:
+        d_input_graph.emplace(
+          raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree));
+        raft::copy(d_input_graph->data_handle(),
+                   input_graph_ptr,
+                   graph_size * input_graph_degree,
+                   resource::get_cuda_stream(res));
+        d_input_graph_ptr = d_input_graph->data_handle();
+    }
 
     constexpr int MAX_DEGREE = 1024;
     if (input_graph_degree > MAX_DEGREE) {
@@ -399,7 +410,7 @@ void optimize(raft::resources const& res,
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
       kern_prune<MAX_DEGREE, IdxT>
         <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-          d_input_graph.data_handle(),
+          d_input_graph_ptr,
           graph_size,
           input_graph_degree,
           output_graph_degree,
@@ -416,7 +427,7 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    // if we're using a managed memory resource, we don't need to make a separate copy of
+    // if we're using a managed memory (or HMM/ATS), we don't need to make a separate copy of
     // detour_count on the host - and can save allocating the extra host memory
     uint8_t* host_detour_count_ptr = NULL;
     std::optional<host_matrix<uint8_t, int64_t>> detour_count;

From eb86b1ec2d7ef754f99f90e3f7a59ce9d5aecdc6 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 29 Aug 2023 13:25:47 -0700
Subject: [PATCH 07/11] refactor

---
 cpp/include/raft/neighbors/cagra_types.hpp    | 11 +++
 .../neighbors/detail/cagra/graph_core.cuh     | 43 ++---------
 .../raft/neighbors/detail/cagra/utils.hpp     | 77 +++++++++++++++++++
 3 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 02e3f5338e..7e2b6bc06e 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -287,6 +287,17 @@ struct index : ann::index {
     graph_view_ = knn_graph;
   }
 
+  /**
+   * Replace the graph with a new graph, taking ownership of the input device
+   * matrix. The index will manage the lifetime.
+   */
+  void update_graph(raft::resources const& res,
+                    raft::device_matrix<IdxT, int64_t, row_major>&& knn_graph)
+  {
+    graph_      = std::move(knn_graph);
+    graph_view_ = graph_.view();
+  }
+
   /**
    * Replace the graph with a new graph.
    *
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 947b8e66ca..4b3546f8c8 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -371,24 +371,10 @@ void optimize(raft::resources const& res,
     const double time_prune_start = cur_time();
     RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-    // only make a device copy of the the graph if it isn't already accessible on the
-    // device (which might already be because of uvm/hmm/ats etc)
-    std::optional<device_matrix<IdxT, int64_t>> d_input_graph;
-    IdxT* d_input_graph_ptr = NULL;
-    switch (spatial::knn::detail::utils::check_pointer_residency(input_graph_ptr)) {
-      case spatial::knn::detail::utils::pointer_residency::host_and_device:
-      case spatial::knn::detail::utils::pointer_residency::device_only:
-        d_input_graph_ptr = input_graph_ptr;
-        break;
-      default:
-        d_input_graph.emplace(
-          raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree));
-        raft::copy(d_input_graph->data_handle(),
-                   input_graph_ptr,
-                   graph_size * input_graph_degree,
-                   resource::get_cuda_stream(res));
-        d_input_graph_ptr = d_input_graph->data_handle();
-    }
+    // Copy input_graph_ptr over to device if necessary
+    device_matrix_view_from_host d_input_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));
 
     constexpr int MAX_DEGREE = 1024;
     if (input_graph_degree > MAX_DEGREE) {
@@ -410,7 +396,7 @@ void optimize(raft::resources const& res,
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
       kern_prune<MAX_DEGREE, IdxT>
         <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-          d_input_graph_ptr,
+          d_input_graph.view().data_handle(),
           graph_size,
           input_graph_degree,
           output_graph_degree,
@@ -427,23 +413,8 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    // if we're using a managed memory (or HMM/ATS), we don't need to make a separate copy of
-    // detour_count on the host - and can save allocating the extra host memory
-    uint8_t* host_detour_count_ptr = NULL;
-    std::optional<host_matrix<uint8_t, int64_t>> detour_count;
-    switch (spatial::knn::detail::utils::check_pointer_residency(d_detour_count.data_handle())) {
-      case spatial::knn::detail::utils::pointer_residency::host_and_device:
-        host_detour_count_ptr = d_detour_count.data_handle();
-        break;
-      default:
-        detour_count.emplace(
-          raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree));
-        raft::copy(detour_count->data_handle(),
-                   d_detour_count.data_handle(),
-                   graph_size * input_graph_degree,
-                   resource::get_cuda_stream(res));
-        host_detour_count_ptr = detour_count->data_handle();
-    }
+    host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
+    uint8_t* host_detour_count_ptr = detour_count.view().data_handle();
 
     raft::copy(
       host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
index 22c7a60647..72d57fa413 100644
--- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -20,6 +20,8 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <type_traits>
 
 namespace raft::neighbors::cagra::detail {
@@ -150,4 +152,79 @@ struct gen_index_msb_1_mask {
 };
 }  // namespace utils
 
+/**
+ * Utility to sync memory from a host_matrix_view to a device_matrix_view
+ *
+ * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the
+ * device, and no copy or extra allocations need to be performed. This class checks
+ * if the host_matrix_view is already accessible on the device, and only creates device
+ *  memory and copies over if necessary
+ */
+template <typename T, typename IdxT>
+struct device_matrix_view_from_host {
+  device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
+    : host_view_(host_view)
+  {
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
+    device_ptr = reinterpret_cast<T*>(attr.devicePointer);
+    if (device_ptr == NULL) {
+      // allocate memory and copy over
+      device_mem_.emplace(
+        raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
+      raft::copy(device_mem_->data_handle(),
+                 host_view.data_handle(),
+                 host_view.extent(0) * host_view.extent(1),
+                 resource::get_cuda_stream(res));
+      device_ptr = device_mem_->data_handle();
+    }
+  }
+
+  device_matrix_view<T, IdxT> view()
+  {
+    return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
+  }
+
+  std::optional<device_matrix<T, IdxT>> device_mem_;
+  host_matrix_view<T, IdxT> host_view_;
+  T* device_ptr;
+};
+
+/**
+ * Utility to sync memory from a device_matrix_view to a host_matrix_view
+ *
+ * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the
+ * host, and no copy or extra allocations need to be performed. This class checks
+ * if the device_matrix_view is already accessible on the host, and only creates host
+ *  memory and copies over if necessary
+ */
+template <typename T, typename IdxT>
+struct host_matrix_view_from_device {
+  host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
+    : device_view_(device_view)
+  {
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle()));
+    host_ptr = reinterpret_cast<T*>(attr.hostPointer);
+    if (host_ptr == NULL) {
+      // allocate memory and copy over
+      host_mem_.emplace(
+        raft::make_host_matrix<T, IdxT>(device_view.extent(0), device_view.extent(1)));
+      raft::copy(host_mem_->data_handle(),
+                 device_view.data_handle(),
+                 device_view.extent(0) * device_view.extent(1),
+                 resource::get_cuda_stream(res));
+      host_ptr = host_mem_->data_handle();
+    }
+  }
+
+  host_matrix_view<T, IdxT> view()
+  {
+    return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
+  }
+
+  std::optional<host_matrix<T, IdxT>> host_mem_;
+  device_matrix_view<T, IdxT> device_view_;
+  T* host_ptr;
+};
 }  // namespace raft::neighbors::cagra::detail

From 191e7fcd87e4a01d9e76049cb33592d94f5d6c85 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 29 Aug 2023 17:59:21 -0700
Subject: [PATCH 08/11] rev_graph

---
 .../neighbors/detail/cagra/graph_core.cuh     | 18 ++++++-------
 .../raft/neighbors/detail/cagra/utils.hpp     | 26 ++++++++++++++-----
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 4b3546f8c8..18d451be60 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -396,7 +396,7 @@ void optimize(raft::resources const& res,
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
       kern_prune<MAX_DEGREE, IdxT>
         <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-          d_input_graph.view().data_handle(),
+          d_input_graph.data_handle(),
           graph_size,
           input_graph_degree,
           output_graph_degree,
@@ -414,7 +414,6 @@ void optimize(raft::resources const& res,
     RAFT_LOG_DEBUG("\n");
 
     host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
-    uint8_t* host_detour_count_ptr = detour_count.view().data_handle();
 
     raft::copy(
       host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
@@ -429,7 +428,7 @@ void optimize(raft::resources const& res,
       for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) {
         if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
         for (uint64_t k = 0; k < input_graph_degree; k++) {
-          if (host_detour_count_ptr[k + (input_graph_degree * i)] != num_detour) { continue; }
+          if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
           output_graph_ptr[pk + (output_graph_degree * i)] =
             input_graph_ptr[k + (input_graph_degree * i)];
           pk += 1;
@@ -461,8 +460,7 @@ void optimize(raft::resources const& res,
     //
     const double time_make_start = cur_time();
 
-    auto d_rev_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, output_graph_degree);
+    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
     RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
                                   0xff,
                                   graph_size * output_graph_degree * sizeof(IdxT),
@@ -503,10 +501,12 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(rev_graph.data_handle(),
-               d_rev_graph.data_handle(),
-               graph_size * output_graph_degree,
-               resource::get_cuda_stream(res));
+    if (d_rev_graph.allocated_memory()) {
+      raft::copy(rev_graph.data_handle(),
+                 d_rev_graph.data_handle(),
+                 graph_size * output_graph_degree,
+                 resource::get_cuda_stream(res));
+    }
     raft::copy(rev_graph_count.data_handle(),
                d_rev_graph_count.data_handle(),
                graph_size,
diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
index 72d57fa413..686fdc9bbe 100644
--- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -156,12 +156,14 @@ struct gen_index_msb_1_mask {
  * Utility to sync memory from a host_matrix_view to a device_matrix_view
  *
  * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the
- * device, and no copy or extra allocations need to be performed. This class checks
+ * device, and no extra allocations need to be performed. This class checks
  * if the host_matrix_view is already accessible on the device, and only creates device
- *  memory and copies over if necessary
+ * memory and copies over if necessary. In memory limited situations this is preferable
+ * to having both a host and device copy
  */
 template <typename T, typename IdxT>
-struct device_matrix_view_from_host {
+class device_matrix_view_from_host {
+ public:
   device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
     : host_view_(host_view)
   {
@@ -185,6 +187,11 @@ struct device_matrix_view_from_host {
     return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
   }
 
+  T* data_handle() { return device_ptr; }
+
+  bool allocated_memory() const { return device_mem_.has_value(); }
+
+ private:
   std::optional<device_matrix<T, IdxT>> device_mem_;
   host_matrix_view<T, IdxT> host_view_;
   T* device_ptr;
@@ -194,12 +201,14 @@ struct device_matrix_view_from_host {
  * Utility to sync memory from a device_matrix_view to a host_matrix_view
  *
  * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the
- * host, and no copy or extra allocations need to be performed. This class checks
+ * host, and no extra allocations need to be performed. This class checks
  * if the device_matrix_view is already accessible on the host, and only creates host
- *  memory and copies over if necessary
+ * memory and copies over if necessary. In memory limited situations this is preferable
+ * to having both a host and device copy
  */
 template <typename T, typename IdxT>
-struct host_matrix_view_from_device {
+class host_matrix_view_from_device {
+ public:
   host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
     : device_view_(device_view)
   {
@@ -223,6 +232,11 @@ struct host_matrix_view_from_device {
     return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
   }
 
+  T* data_handle() { return host_ptr; }
+
+  bool allocated_memory() const { return host_mem_.has_value(); }
+
+ private:
   std::optional<host_matrix<T, IdxT>> host_mem_;
   device_matrix_view<T, IdxT> device_view_;
   T* host_ptr;

From f539655f770f7cd92fb3e9a064f1210662bde68a Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Wed, 30 Aug 2023 16:06:13 -0700
Subject: [PATCH 09/11] remove accidental checkin

---
 cpp/include/raft/neighbors/cagra_types.hpp | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 7e2b6bc06e..02e3f5338e 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -287,17 +287,6 @@ struct index : ann::index {
     graph_view_ = knn_graph;
   }
 
-  /**
-   * Replace the graph with a new graph, taking ownership of the input device
-   * matrix. The index will manage the lifetime.
-   */
-  void update_graph(raft::resources const& res,
-                    raft::device_matrix<IdxT, int64_t, row_major>&& knn_graph)
-  {
-    graph_      = std::move(knn_graph);
-    graph_view_ = graph_.view();
-  }
-
   /**
    * Replace the graph with a new graph.
    *

From 0e0c5f3b64803ab145b390277186989baffccaf9 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Wed, 30 Aug 2023 16:12:52 -0700
Subject: [PATCH 10/11] revert serialization change

was still making a copy from device->host inside serialize_mdspan, and with the
include_dataset changes this branch won't even be called
---
 .../detail/cagra/cagra_serialize.cuh          | 35 +++++++------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index d0e1f8ea4f..2c9cbd2563 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -71,29 +71,18 @@ void serialize(raft::resources const& res,
   serialize_scalar(res, os, include_dataset);
   if (include_dataset) {
     auto dataset = index_.dataset();
-    if (dataset.stride(0) == dataset.extent(0)) {
-      // Rather than take another copy of the dataset here, just write it out directly.
-      // Since the dataset is a strided layout, we can't pass directly to the serialize_mdspan
-      // - but since the stride is the same as the extent, we can convert to a row-major
-      // mdspan
-      serialize_mdspan(
-        res,
-        os,
-        make_device_matrix_view(dataset.data_handle(), dataset.extent(0), dataset.extent(1)));
-    } else {
-      // Remove padding before saving the dataset
-      auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
-      RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                      sizeof(T) * host_dataset.extent(1),
-                                      dataset.data_handle(),
-                                      sizeof(T) * dataset.stride(0),
-                                      sizeof(T) * host_dataset.extent(1),
-                                      dataset.extent(0),
-                                      cudaMemcpyDefault,
-                                      resource::get_cuda_stream(res)));
-      resource::sync_stream(res);
-      serialize_mdspan(res, os, host_dataset.view());
-    }
+    // Remove padding before saving the dataset
+    auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
+                                    sizeof(T) * host_dataset.extent(1),
+                                    dataset.data_handle(),
+                                    sizeof(T) * dataset.stride(0),
+                                    sizeof(T) * host_dataset.extent(1),
+                                    dataset.extent(0),
+                                    cudaMemcpyDefault,
+                                    resource::get_cuda_stream(res)));
+    resource::sync_stream(res);
+    serialize_mdspan(res, os, host_dataset.view());
   }
 }
 

From 6552c66c820431285c5ee01705dbfaaf7b4435ed Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 5 Sep 2023 10:28:40 -0700
Subject: [PATCH 11/11] Add TODO about using mdbuffer

---
 cpp/include/raft/neighbors/detail/cagra/utils.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
index 686fdc9bbe..22cbe6bbac 100644
--- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -160,6 +160,8 @@ struct gen_index_msb_1_mask {
  * if the host_matrix_view is already accessible on the device, and only creates device
  * memory and copies over if necessary. In memory limited situations this is preferable
  * to having both a host and device copy
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
  */
 template <typename T, typename IdxT>
 class device_matrix_view_from_host {
@@ -205,6 +207,8 @@ class device_matrix_view_from_host {
  * if the device_matrix_view is already accessible on the host, and only creates host
  * memory and copies over if necessary. In memory limited situations this is preferable
  * to having both a host and device copy
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
  */
 template <typename T, typename IdxT>
 class host_matrix_view_from_device {