From 9b0683d607320b1669906db6b4142135e8419ec6 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:45:04 -0400 Subject: [PATCH] [HOTFIX] Fix random walks error identified in CUDA-12 testing (#3647) CUDA-12 testing for release 23.06 detected a non-deterministic failure in the random walks test. After some verification work, we determined there is a bug in the Random Walks implementation that can trigger out-of-bounds memory references. This PR adds a test that triggers the OOB error (if run with `compute-sanitizer`) and a fix that resolves the issue. Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Alex Barghi (https://github.com/alexbarghi-nv) - Rick Ratzel (https://github.com/rlratzel) - Naim (https://github.com/naimnv) --- cpp/src/sampling/random_walks_impl.cuh | 121 +++++++++++++++---------- cpp/tests/c_api/sg_random_walks_test.c | 30 +++++- 2 files changed, 98 insertions(+), 53 deletions(-) diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh index 3a21143eb77..ec595105782 100644 --- a/cpp/src/sampling/random_walks_impl.cuh +++ b/cpp/src/sampling/random_walks_impl.cuh @@ -213,25 +213,22 @@ random_walk_impl(raft::handle_t const& handle, detail::scalar_fill(handle, result_weights->data(), result_weights->size(), weight_t{0}); rmm::device_uvector current_vertices(start_vertices.size(), handle.get_stream()); - rmm::device_uvector current_position(0, handle.get_stream()); + rmm::device_uvector current_position(start_vertices.size(), handle.get_stream()); rmm::device_uvector current_gpu(0, handle.get_stream()); auto new_weights = edge_weight_view ? std::make_optional>(0, handle.get_stream()) : std::nullopt; + raft::copy( + current_vertices.data(), start_vertices.data(), start_vertices.size(), handle.get_stream()); + detail::sequence_fill( + handle.get_stream(), current_position.data(), current_position.size(), size_t{0}); + if constexpr (multi_gpu) { - current_position.resize(start_vertices.size(), handle.get_stream()); current_gpu.resize(start_vertices.size(), handle.get_stream()); - raft::copy( - current_vertices.data(), start_vertices.data(), start_vertices.size(), handle.get_stream()); detail::scalar_fill( handle, current_gpu.data(), current_gpu.size(), handle.get_comms().get_rank()); - detail::sequence_fill( - handle.get_stream(), current_position.data(), current_position.size(), size_t{0}); - } else { - raft::copy( - current_vertices.begin(), start_vertices.begin(), start_vertices.size(), handle.get_stream()); } thrust::for_each( @@ -276,26 +273,20 @@ random_walk_impl(raft::handle_t const& handle, std::tie(current_vertices, new_weights) = random_selector.follow_random_edge(handle, graph_view, edge_weight_view, current_vertices); + // FIXME: remove_if has a 32-bit overflow issue + // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of + // sampling is to extract small graphs) so not going to work around this for now. + CUGRAPH_EXPECTS( + current_vertices.size() < static_cast(std::numeric_limits::max()), + "remove_if will fail, current_vertices.size() is too large"); + if constexpr (multi_gpu) { - // - // Now I can iterate over the tuples (current_vertices, new_weights, current_gpu, - // current_position) and skip over anything where current_vertices == invalid_vertex_id. - // There should, for any vertex, be at most one gpu where the vertex has a new vertex - // neighbor. - // if (result_weights) { auto input_iter = thrust::make_zip_iterator(current_vertices.begin(), new_weights->begin(), current_gpu.begin(), current_position.begin()); - CUGRAPH_EXPECTS( - current_vertices.size() < static_cast(std::numeric_limits::max()), - "remove_if will fail, current_vertices.size() is too large"); - - // FIXME: remove_if has a 32-bit overflow issue - // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of - // sampling is to extract small graphs) so not going to work around this for now. auto compacted_length = thrust::distance( input_iter, thrust::remove_if(handle.get_thrust_policy(), @@ -346,15 +337,8 @@ random_walk_impl(raft::handle_t const& handle, auto input_iter = thrust::make_zip_iterator( current_vertices.begin(), current_gpu.begin(), current_position.begin()); - CUGRAPH_EXPECTS( - current_vertices.size() < static_cast(std::numeric_limits::max()), - "remove_if will fail, current_vertices.size() is too large"); - auto compacted_length = thrust::distance( input_iter, - // FIXME: remove_if has a 32-bit overflow issue - // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of - // sampling is to extract small graphs) so not going to work around this for now. thrust::remove_if(handle.get_thrust_policy(), input_iter, input_iter + current_vertices.size(), @@ -392,28 +376,65 @@ random_walk_impl(raft::handle_t const& handle, } } else { if (result_weights) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(current_vertices.size()), - [current_verts = current_vertices.data(), - new_wgts = new_weights->data(), - result_verts = result_vertices.data(), - result_wgts = result_weights->data(), - level, - max_length] __device__(size_t i) { - result_verts[i * (max_length + 1) + level + 1] = current_verts[i]; - result_wgts[i * max_length + level] = new_wgts[i]; - }); + auto input_iter = thrust::make_zip_iterator( + current_vertices.begin(), new_weights->begin(), current_position.begin()); + + auto compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + + current_vertices.resize(compacted_length, handle.get_stream()); + new_weights->resize(compacted_length, handle.get_stream()); + current_position.resize(compacted_length, handle.get_stream()); + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_zip_iterator( + current_vertices.begin(), new_weights->begin(), current_position.begin()), + thrust::make_zip_iterator( + current_vertices.end(), new_weights->end(), current_position.end()), + [result_verts = result_vertices.data(), + result_wgts = result_weights->data(), + level, + max_length] __device__(auto tuple) { + vertex_t v = thrust::get<0>(tuple); + weight_t w = thrust::get<1>(tuple); + size_t pos = thrust::get<2>(tuple); + result_verts[pos * (max_length + 1) + level + 1] = v; + result_wgts[pos * max_length + level] = w; + }); } else { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(current_vertices.size()), - [current_verts = current_vertices.data(), - result_verts = result_vertices.data(), - level, - max_length] __device__(size_t i) { - result_verts[i * (max_length + 1) + level + 1] = current_verts[i]; - }); + auto input_iter = + thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()); + + auto compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + + current_vertices.resize(compacted_length, handle.get_stream()); + current_position.resize(compacted_length, handle.get_stream()); + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()), + thrust::make_zip_iterator(current_vertices.end(), current_position.end()), + [result_verts = result_vertices.data(), level, max_length] __device__(auto tuple) { + vertex_t v = thrust::get<0>(tuple); + size_t pos = thrust::get<1>(tuple); + result_verts[pos * (max_length + 1) + level + 1] = v; + }); } } } diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c index 7b07967e3c9..99740d284df 100644 --- a/cpp/tests/c_api/sg_random_walks_test.c +++ b/cpp/tests/c_api/sg_random_walks_test.c @@ -95,10 +95,17 @@ int generic_uniform_random_walks_test(vertex_t* h_src, // NOTE: The C++ tester does a more thorough validation. For our purposes // here we will do a simpler validation, merely checking that all edges // are actually part of the graph - weight_t M[num_vertices][num_vertices]; - for (int i = 0; i < num_vertices; ++i) - for (int j = 0; j < num_vertices; ++j) + size_t unrenumbered_vertex_size = num_vertices; + for (size_t i = 0 ; i < num_edges ; ++i) { + if (h_src[i] > unrenumbered_vertex_size) unrenumbered_vertex_size = h_src[i]; + if (h_dst[i] > unrenumbered_vertex_size) unrenumbered_vertex_size = h_dst[i]; + } + ++unrenumbered_vertex_size; + weight_t M[unrenumbered_vertex_size][unrenumbered_vertex_size]; + + for (int i = 0; i < unrenumbered_vertex_size; ++i) + for (int j = 0; j < unrenumbered_vertex_size; ++j) M[i][j] = -1; for (int i = 0; i < num_edges; ++i) @@ -408,11 +415,28 @@ int test_node2vec_random_walks() src, dst, wgt, num_vertices, num_edges, start, num_starts, 3, p, q, FALSE, FALSE); } +int test_uniform_random_walks_oob() +{ + size_t num_edges = 5; + size_t num_vertices = 6; + size_t num_starts = 4; + size_t max_depth = 7; + + vertex_t src[] = {1, 2, 4, 7, 3}; + vertex_t dst[] = {5, 4, 1, 5, 2}; + weight_t wgt[] = {0.4, 0.5, 0.6, 0.7, 0.8}; + vertex_t start[] = {2, 5, 3, 1}; + + return generic_uniform_random_walks_test( + src, dst, wgt, num_vertices, num_edges, start, num_starts, max_depth, TRUE, FALSE); +} + int main(int argc, char** argv) { int result = 0; result |= RUN_TEST(test_uniform_random_walks); result |= RUN_TEST(test_biased_random_walks); result |= RUN_TEST(test_node2vec_random_walks); + result |= RUN_TEST(test_uniform_random_walks_oob); return result; }