From 9b0683d607320b1669906db6b4142135e8419ec6 Mon Sep 17 00:00:00 2001
From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com>
Date: Fri, 9 Jun 2023 09:45:04 -0400
Subject: [PATCH] [HOTFIX] Fix random walks error identified in CUDA-12 testing
 (#3647)

CUDA-12 testing for release 23.06 detected a non-deterministic failure in the random walks test.

After some verification work, we determined there is a bug in the Random Walks implementation that can trigger out-of-bounds memory references.

This PR adds a test that triggers the OOB error (if run with `compute-sanitizer`) and a fix that resolves the issue.

Authors:
   - Chuck Hastings (https://github.com/ChuckHastings)

Approvers:
   - Seunghwa Kang (https://github.com/seunghwak)
   - Alex Barghi (https://github.com/alexbarghi-nv)
   - Rick Ratzel (https://github.com/rlratzel)
   - Naim (https://github.com/naimnv)
---
 cpp/src/sampling/random_walks_impl.cuh | 121 +++++++++++++++----------
 cpp/tests/c_api/sg_random_walks_test.c |  30 +++++-
 2 files changed, 98 insertions(+), 53 deletions(-)
diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh
index 3a21143eb77..ec595105782 100644
--- a/cpp/src/sampling/random_walks_impl.cuh
+++ b/cpp/src/sampling/random_walks_impl.cuh
@@ -213,25 +213,22 @@ random_walk_impl(raft::handle_t const& handle,
     detail::scalar_fill(handle, result_weights->data(), result_weights->size(), weight_t{0});
 
   rmm::device_uvector<vertex_t> current_vertices(start_vertices.size(), handle.get_stream());
-  rmm::device_uvector<size_t> current_position(0, handle.get_stream());
+  rmm::device_uvector<size_t> current_position(start_vertices.size(), handle.get_stream());
   rmm::device_uvector<int> current_gpu(0, handle.get_stream());
   auto new_weights = edge_weight_view
                        ? std::make_optional<rmm::device_uvector<weight_t>>(0, handle.get_stream())
                        : std::nullopt;
 
+  raft::copy(
+    current_vertices.data(), start_vertices.data(), start_vertices.size(), handle.get_stream());
+  detail::sequence_fill(
+    handle.get_stream(), current_position.data(), current_position.size(), size_t{0});
+
   if constexpr (multi_gpu) {
-    current_position.resize(start_vertices.size(), handle.get_stream());
     current_gpu.resize(start_vertices.size(), handle.get_stream());
 
-    raft::copy(
-      current_vertices.data(), start_vertices.data(), start_vertices.size(), handle.get_stream());
     detail::scalar_fill(
       handle, current_gpu.data(), current_gpu.size(), handle.get_comms().get_rank());
-    detail::sequence_fill(
-      handle.get_stream(), current_position.data(), current_position.size(), size_t{0});
-  } else {
-    raft::copy(
-      current_vertices.begin(), start_vertices.begin(), start_vertices.size(), handle.get_stream());
   }
 
   thrust::for_each(
@@ -276,26 +273,20 @@ random_walk_impl(raft::handle_t const& handle,
     std::tie(current_vertices, new_weights) =
       random_selector.follow_random_edge(handle, graph_view, edge_weight_view, current_vertices);
 
+    // FIXME: remove_if has a 32-bit overflow issue
+    // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
+    // sampling is to extract small graphs) so not going to work around this for now.
+    CUGRAPH_EXPECTS(
+      current_vertices.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
+      "remove_if will fail, current_vertices.size() is too large");
+
     if constexpr (multi_gpu) {
-      //
-      //  Now I can iterate over the tuples (current_vertices, new_weights, current_gpu,
-      //  current_position) and skip over anything where current_vertices == invalid_vertex_id.
-      //  There should, for any vertex, be at most one gpu where the vertex has a new vertex
-      //  neighbor.
-      //
       if (result_weights) {
         auto input_iter = thrust::make_zip_iterator(current_vertices.begin(),
                                                     new_weights->begin(),
                                                     current_gpu.begin(),
                                                     current_position.begin());
 
-        CUGRAPH_EXPECTS(
-          current_vertices.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-          "remove_if will fail, current_vertices.size() is too large");
-
-        // FIXME: remove_if has a 32-bit overflow issue
-        // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
-        // sampling is to extract small graphs) so not going to work around this for now.
         auto compacted_length = thrust::distance(
           input_iter,
           thrust::remove_if(handle.get_thrust_policy(),
@@ -346,15 +337,8 @@ random_walk_impl(raft::handle_t const& handle,
         auto input_iter = thrust::make_zip_iterator(
           current_vertices.begin(), current_gpu.begin(), current_position.begin());
 
-        CUGRAPH_EXPECTS(
-          current_vertices.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-          "remove_if will fail, current_vertices.size() is too large");
-
         auto compacted_length = thrust::distance(
           input_iter,
-          // FIXME: remove_if has a 32-bit overflow issue
-          // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
-          // sampling is to extract small graphs) so not going to work around this for now.
           thrust::remove_if(handle.get_thrust_policy(),
                             input_iter,
                             input_iter + current_vertices.size(),
@@ -392,28 +376,65 @@ random_walk_impl(raft::handle_t const& handle,
       }
     } else {
       if (result_weights) {
-        thrust::for_each(handle.get_thrust_policy(),
-                         thrust::make_counting_iterator<size_t>(0),
-                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
-                         [current_verts = current_vertices.data(),
-                          new_wgts      = new_weights->data(),
-                          result_verts  = result_vertices.data(),
-                          result_wgts   = result_weights->data(),
-                          level,
-                          max_length] __device__(size_t i) {
-                           result_verts[i * (max_length + 1) + level + 1] = current_verts[i];
-                           result_wgts[i * max_length + level]            = new_wgts[i];
-                         });
+        auto input_iter = thrust::make_zip_iterator(
+          current_vertices.begin(), new_weights->begin(), current_position.begin());
+
+        auto compacted_length = thrust::distance(
+          input_iter,
+          thrust::remove_if(handle.get_thrust_policy(),
+                            input_iter,
+                            input_iter + current_vertices.size(),
+                            current_vertices.begin(),
+                            [] __device__(auto dst) {
+                              return (dst == cugraph::invalid_vertex_id<vertex_t>::value);
+                            }));
+
+        current_vertices.resize(compacted_length, handle.get_stream());
+        new_weights->resize(compacted_length, handle.get_stream());
+        current_position.resize(compacted_length, handle.get_stream());
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_zip_iterator(
+            current_vertices.begin(), new_weights->begin(), current_position.begin()),
+          thrust::make_zip_iterator(
+            current_vertices.end(), new_weights->end(), current_position.end()),
+          [result_verts = result_vertices.data(),
+           result_wgts  = result_weights->data(),
+           level,
+           max_length] __device__(auto tuple) {
+            vertex_t v                                       = thrust::get<0>(tuple);
+            weight_t w                                       = thrust::get<1>(tuple);
+            size_t pos                                       = thrust::get<2>(tuple);
+            result_verts[pos * (max_length + 1) + level + 1] = v;
+            result_wgts[pos * max_length + level]            = w;
+          });
       } else {
-        thrust::for_each(handle.get_thrust_policy(),
-                         thrust::make_counting_iterator<size_t>(0),
-                         thrust::make_counting_iterator<size_t>(current_vertices.size()),
-                         [current_verts = current_vertices.data(),
-                          result_verts  = result_vertices.data(),
-                          level,
-                          max_length] __device__(size_t i) {
-                           result_verts[i * (max_length + 1) + level + 1] = current_verts[i];
-                         });
+        auto input_iter =
+          thrust::make_zip_iterator(current_vertices.begin(), current_position.begin());
+
+        auto compacted_length = thrust::distance(
+          input_iter,
+          thrust::remove_if(handle.get_thrust_policy(),
+                            input_iter,
+                            input_iter + current_vertices.size(),
+                            current_vertices.begin(),
+                            [] __device__(auto dst) {
+                              return (dst == cugraph::invalid_vertex_id<vertex_t>::value);
+                            }));
+
+        current_vertices.resize(compacted_length, handle.get_stream());
+        current_position.resize(compacted_length, handle.get_stream());
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()),
+          thrust::make_zip_iterator(current_vertices.end(), current_position.end()),
+          [result_verts = result_vertices.data(), level, max_length] __device__(auto tuple) {
+            vertex_t v                                       = thrust::get<0>(tuple);
+            size_t pos                                       = thrust::get<1>(tuple);
+            result_verts[pos * (max_length + 1) + level + 1] = v;
+          });
       }
     }
   }
diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c
index 7b07967e3c9..99740d284df 100644
--- a/cpp/tests/c_api/sg_random_walks_test.c
+++ b/cpp/tests/c_api/sg_random_walks_test.c
@@ -95,10 +95,17 @@ int generic_uniform_random_walks_test(vertex_t* h_src,
   //  NOTE:  The C++ tester does a more thorough validation.  For our purposes
   //  here we will do a simpler validation, merely checking that all edges
   //  are actually part of the graph
-  weight_t M[num_vertices][num_vertices];
 
-  for (int i = 0; i < num_vertices; ++i)
-    for (int j = 0; j < num_vertices; ++j)
+  size_t unrenumbered_vertex_size = num_vertices;
+  for (size_t i = 0 ; i < num_edges ; ++i) {
+    if (h_src[i] > unrenumbered_vertex_size) unrenumbered_vertex_size = h_src[i];
+    if (h_dst[i] > unrenumbered_vertex_size) unrenumbered_vertex_size = h_dst[i];
+  }
+  ++unrenumbered_vertex_size;
+  weight_t M[unrenumbered_vertex_size][unrenumbered_vertex_size];
+
+  for (int i = 0; i < unrenumbered_vertex_size; ++i)
+    for (int j = 0; j < unrenumbered_vertex_size; ++j)
       M[i][j] = -1;
 
   for (int i = 0; i < num_edges; ++i)
@@ -408,11 +415,28 @@ int test_node2vec_random_walks()
     src, dst, wgt, num_vertices, num_edges, start, num_starts, 3, p, q, FALSE, FALSE);
 }
 
+int test_uniform_random_walks_oob()
+{
+  size_t num_edges    = 5;
+  size_t num_vertices = 6;
+  size_t num_starts   = 4;
+  size_t max_depth = 7;
+
+  vertex_t src[]   = {1, 2, 4, 7, 3};
+  vertex_t dst[]   = {5, 4, 1, 5, 2};
+  weight_t wgt[]   = {0.4, 0.5, 0.6, 0.7, 0.8};
+  vertex_t start[] = {2, 5, 3, 1};
+
+  return generic_uniform_random_walks_test(
+    src, dst, wgt, num_vertices, num_edges, start, num_starts, max_depth, TRUE, FALSE);
+}
+
 int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_uniform_random_walks);
   result |= RUN_TEST(test_biased_random_walks);
   result |= RUN_TEST(test_node2vec_random_walks);
+  result |= RUN_TEST(test_uniform_random_walks_oob);
   return result;
 }