From aaad98e9e81b84cd1e29c08349afcf8b5669d6e1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 19 Oct 2021 13:36:15 -0400 Subject: [PATCH 1/7] add overflow check --- cpp/tests/utilities/test_graphs.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 931c69c7357..dc836c0a6f4 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -134,6 +134,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { bool> construct_edgelist(raft::handle_t const& handle, bool test_weighted) const { + CUGRAPH_EXPECTS( + (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), + "Invalid template parameter: scale_ too large for vertex_t."); + CUGRAPH_EXPECTS(((size_t{1} << scale_) * edge_factor_) <= + static_cast(std::numeric_limits::max()), + "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); + std::vector partition_ids(1); size_t num_partitions; From 3ce426b36a345a892898dba43c6e13ac8b1d6bc8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 19 Oct 2021 16:56:52 -0400 Subject: [PATCH 2/7] minor reordering of opreations to potentially lower peak memory usage --- cpp/src/structure/graph_impl.cuh | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 77fe5a3d7b6..3de8e26415a 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -846,6 +846,7 @@ graph_tdecompress_to_edgelist(handle, wrapped_renumber_map, true); + std::tie(edgelist_rows, edgelist_cols, edgelist_weights) = symmetrize_edgelist( handle, @@ -887,13 +888,7 @@ graph_tdecompress_to_edgelist(handle, renumber_map, true); - auto vertex_span = renumber ? std::move(renumber_map) - : std::make_optional>( - number_of_vertices, handle.get_stream()); - if (!renumber) { - thrust::sequence( - handle.get_thrust_policy(), (*vertex_span).begin(), (*vertex_span).end(), vertex_t{0}); - } + std::tie(edgelist_rows, edgelist_cols, edgelist_weights) = symmetrize_edgelist( handle, @@ -902,6 +897,14 @@ graph_t>( + number_of_vertices, handle.get_stream()); + if (!renumber) { + thrust::sequence( + handle.get_thrust_policy(), (*vertex_span).begin(), (*vertex_span).end(), vertex_t{0}); + } + auto [symmetrized_graph, new_renumber_map] = create_graph_from_edgelist( handle, From 61b626090b3c8e341c36c4e2548f2a78f30a288b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 28 Oct 2021 08:54:06 -0700 Subject: [PATCH 3/7] update cudaSetDevice call to set GPU ID to rank % num_gpus_per_node --- cpp/tests/utilities/base_fixture.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index d58d71d9bd2..7b922b87111 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -193,11 +193,9 @@ inline auto parse_test_options(int argc, char** argv) int comm_size{}; \ MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank)); \ MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); \ - int num_gpus{}; \ - CUDA_TRY(cudaGetDeviceCount(&num_gpus)); \ - CUGRAPH_EXPECTS( \ - comm_size <= num_gpus, "# MPI ranks (%d) > # GPUs (%d).", comm_size, num_gpus); \ - CUDA_TRY(cudaSetDevice(comm_rank)); \ + int num_gpus_per_node{}; \ + CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); \ + CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); \ ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ From 3d31cc306aa4ec0b230cd557e96f0ea3dc66407e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 29 Oct 2021 11:07:48 -0400 Subject: [PATCH 4/7] cut memory requirement in renumbering --- cpp/src/structure/renumber_edgelist_impl.cuh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index f0c03a5901e..15a5ef48a2a 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -174,20 +174,30 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); - vertex_t num_local_unique_edge_minors{0}; + vertex_t minor_offset{0}; for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minor_vertices[i], edgelist_minor_vertices[i] + edgelist_edge_counts[i], - minor_labels.begin() + minor_displs[i]); + minor_labels.begin() + minor_offset); + thrust::sort(handle.get_thrust_policy(), + minor_labels.begin() + minor_offset, + minor_labels.begin() + minor_offset + edgelist_edge_counts[i]); + minor_offset += + thrust::distance(minor_labels.begin() + minor_offset, + thrust::unique(handle.get_thrust_policy(), + minor_labels.begin() + minor_offset, + minor_labels.begin() + minor_offset + + edgelist_edge_counts[i])); } + minor_labels.resize(minor_offset, handle.get_stream()); thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end()); minor_labels.resize( thrust::distance( minor_labels.begin(), thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), handle.get_stream()); - num_local_unique_edge_minors += static_cast(minor_labels.size()); + auto num_local_unique_edge_minors = static_cast(minor_labels.size()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); From 5871ea42a37ec6db1d0f73d74acf3754ddf66f8f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 29 Oct 2021 15:19:56 -0400 Subject: [PATCH 5/7] update renumber_edgelist to take an optional vertex list as an R-value of std::optional> --- cpp/include/cugraph/graph_functions.hpp | 72 ++--- cpp/src/structure/coarsen_graph_impl.cuh | 6 +- .../create_graph_from_edgelist_impl.cuh | 27 +- cpp/src/structure/renumber_edgelist_impl.cuh | 292 +++++++++--------- cpp/src/structure/renumber_edgelist_mg.cu | 18 +- cpp/src/structure/renumber_edgelist_sg.cu | 39 ++- 6 files changed, 217 insertions(+), 237 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 01b774962f1..5ddc244b183 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -59,27 +59,26 @@ struct renumber_meta_t * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. - * @param local_vertex_span If valid, part of the entire set of vertices in the graph to be - * renumbered. The first tuple element is the pointer to the array and the second tuple element is - * the size of the array. This parameter can be used to include isolated vertices. Applying the + * @param local_vertices If valid, part of the entire set of vertices in the graph to be renumbered. + * This parameter can be used to include isolated vertices. Applying the * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to * work (vertices should be pre-shuffled). - * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition - * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as - * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex - * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target - * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, - * minor) pair should return the GPU ID of this process and applying the - * compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition - * should return the partition ID of the corresponding matrix partition. - * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition - * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored - * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs - * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target - * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, - * minor) pair should return the GPU ID of this process and applying the - * compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition - * should return the partition ID of the corresponding matrix partition. + * @param edgelist_majors Pointers (one pointer per local graph adjacency matrix partition assigned + * to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as is) or + * edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are + * updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process + * & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) + * pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t + * fuctor to every (major, minor) pair for a local matrix partition should return the partition ID + * of the corresponding matrix partition. + * @param edgelist_minors Pointers (one pointer per local graph adjacency matrix partition assigned + * to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored as is) + * or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are + * updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process + * & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) + * pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t + * fuctor to every (major, minor) pair for a local matrix partition should return the partition ID + * of the corresponding matrix partition. * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition * assigned to this process). * @param edgelist_intra_partition_segment_offsets If valid, store segment offsets within a local @@ -101,9 +100,9 @@ std::enable_if_t< std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::optional>&& local_vertices, + std::vector const& edgelist_majors /* [INOUT] */, + std::vector const& edgelist_minors /* [INOUT] */, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check = false); @@ -117,17 +116,14 @@ renumber_edgelist( * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. - * @param vertex_span If valid, vertices in the graph to be renumbered. The first tuple element is - * the pointer to the array and the second tuple element is the size of the array. This parameter - * can be used to include isolated vertices. - * @param vertices The entire set of vertices in the graph to be renumbered. - * @param num_vertices Number of vertices. - * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as - * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex - * IDs are updated in-place ([INOUT] parameter). - * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is - * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). - * Vertex IDs are updated in-place ([INOUT] parameter). + * @param vertices If valid, vertices in the graph to be renumbered. This parameter can be used to + * include isolated vertices. + * @param edgelist_majors Edge source vertex IDs (if the graph adjacency matrix is stored as is) or + * edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are + * updated in-place ([INOUT] parameter). + * @param edgelist_minors Edge destination vertex IDs (if the graph adjacency matrix is stored as + * is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs + * are updated in-place ([INOUT] parameter). * @param num_edgelist_edges Number of edges in the edgelist. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return std::tuple, renumber_meta_t> @@ -141,9 +137,9 @@ std::enable_if_t< !multi_gpu, std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, - std::optional> vertex_span, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, + std::optional>&& vertices, + vertex_t* edgelist_majors /* [INOUT] */, + vertex_t* edgelist_minors /* [INOUT] */, edge_t num_edgelist_edges, bool do_expensive_check = false); @@ -462,7 +458,7 @@ extract_induced_subgraphs( * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. - * @param vertex_span If valid, part of the entire set of vertices in the graph to be renumbered. + * @param vertices If valid, part of the entire set of vertices in the graph to be renumbered. * This parameter can be used to include isolated vertices. If multi-GPU, applying the * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to * work (vertices should be pre-shuffled). @@ -487,7 +483,7 @@ template , std::optional>> create_graph_from_edgelist(raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_rows, rmm::device_uvector&& edgelist_cols, std::optional>&& edgelist_weights, diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 2fbdfe25f79..f49c7be626d 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -409,8 +409,7 @@ coarsen_graph( } std::tie(renumber_map_labels, meta) = renumber_edgelist( handle, - std::optional>{ - std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + std::optional>{std::move(unique_labels)}, major_ptrs, minor_ptrs, counts, @@ -493,8 +492,7 @@ coarsen_graph( auto [renumber_map_labels, meta] = renumber_edgelist( handle, - std::optional>{ - std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + std::optional>{std::move(unique_labels)}, coarsened_edgelist_major_vertices.data(), coarsened_edgelist_minor_vertices.data(), static_cast(coarsened_edgelist_major_vertices.size()), diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 13d3acd9cc4..4516b087180 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -226,7 +226,7 @@ std::enable_if_t< std::tuple, std::optional>>> create_graph_from_edgelist_impl(raft::handle_t const& handle, - std::optional>&& local_vertex_span, + std::optional>&& local_vertices, rmm::device_uvector&& edgelist_rows, rmm::device_uvector&& edgelist_cols, std::optional>&& edgelist_weights, @@ -247,7 +247,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, if (do_expensive_check) { expensive_check_edgelist(handle, - local_vertex_span, + local_vertices, store_transposed ? edgelist_cols : edgelist_rows, store_transposed ? edgelist_rows : edgelist_cols); } @@ -296,10 +296,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, } auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( handle, - local_vertex_span - ? std::optional>{std::make_tuple( - (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} - : std::nullopt, + std::move(local_vertices), major_ptrs, minor_ptrs, edgelist_edge_counts, @@ -343,7 +340,7 @@ std::enable_if_t< std::tuple, std::optional>>> create_graph_from_edgelist_impl(raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_rows, rmm::device_uvector&& edgelist_cols, std::optional>&& edgelist_weights, @@ -358,11 +355,13 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, if (do_expensive_check) { expensive_check_edgelist(handle, - vertex_span, + vertices, store_transposed ? edgelist_cols : edgelist_rows, store_transposed ? edgelist_rows : edgelist_cols); } + auto input_vertex_list_size = vertices ? static_cast((*vertices).size()) : vertex_t{0}; + auto renumber_map_labels = renumber ? std::make_optional>(0, handle.get_stream()) : std::nullopt; @@ -370,9 +369,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, if (renumber) { std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist( handle, - vertex_span ? std::optional>{std::make_tuple( - (*vertex_span).data(), static_cast((*vertex_span).size()))} - : std::nullopt, + std::move(vertices), store_transposed ? edgelist_cols.data() : edgelist_rows.data(), store_transposed ? edgelist_rows.data() : edgelist_cols.data(), static_cast(edgelist_rows.size())); @@ -382,8 +379,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, if (renumber) { num_vertices = static_cast((*renumber_map_labels).size()); } else { - if (vertex_span) { - num_vertices = static_cast((*vertex_span).size()); + if (vertices) { + num_vertices = input_vertex_list_size; } else { num_vertices = 1 + cugraph::detail::compute_maximum_vertex_id( handle.get_stream_view(), edgelist_rows, edgelist_cols); @@ -416,7 +413,7 @@ template , std::optional>> create_graph_from_edgelist(raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_rows, rmm::device_uvector&& edgelist_cols, std::optional>&& edgelist_weights, @@ -426,7 +423,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, { return create_graph_from_edgelist_impl( handle, - std::move(vertex_span), + std::move(vertices), std::move(edgelist_rows), std::move(edgelist_cols), std::move(edgelist_weights), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 15a5ef48a2a..2c22630ea3f 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -49,9 +49,9 @@ namespace detail { template std::tuple, std::vector, vertex_t, vertex_t> compute_renumber_map(raft::handle_t const& handle, - std::optional> vertex_span, - std::vector const& edgelist_major_vertices, - std::vector const& edgelist_minor_vertices, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { // FIXME: compare this sort based approach with hash based approach in both speed and memory @@ -78,15 +78,15 @@ compute_renumber_map(raft::handle_t const& handle, rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); vertex_t num_local_unique_edge_majors{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { rmm::device_uvector tmp_major_labels(0, handle.get_stream()); rmm::device_uvector tmp_major_counts(0, handle.get_stream()); { rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - edgelist_major_vertices[i], - edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], sorted_major_labels.begin()); // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse thrust::sort( @@ -169,26 +169,25 @@ compute_renumber_map(raft::handle_t const& handle, // 2. acquire unique minor labels - std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::vector minor_displs(edgelist_minors.size(), edge_t{0}); std::partial_sum( edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); vertex_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { thrust::copy(handle.get_thrust_policy(), - edgelist_minor_vertices[i], - edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], minor_labels.begin() + minor_offset); thrust::sort(handle.get_thrust_policy(), minor_labels.begin() + minor_offset, minor_labels.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += - thrust::distance(minor_labels.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - minor_labels.begin() + minor_offset, - minor_labels.begin() + minor_offset + - edgelist_edge_counts[i])); + minor_offset += thrust::distance( + minor_labels.begin() + minor_offset, + thrust::unique(handle.get_thrust_policy(), + minor_labels.begin() + minor_offset, + minor_labels.begin() + minor_offset + edgelist_edge_counts[i])); } minor_labels.resize(minor_offset, handle.get_stream()); thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end()); @@ -287,27 +286,28 @@ compute_renumber_map(raft::handle_t const& handle, auto num_non_isolated_vertices = static_cast(labels.size()); - // 4. if vertex_span.has_value() == true, append isolated vertices + // 4. if local_vertices.has_value() == true, append isolated vertices - if (vertex_span) { + if (local_vertices) { rmm::device_uvector isolated_vertices(0, handle.get_stream()); - auto [vertices, num_vertices] = *vertex_span; - auto num_isolated_vertices = thrust::count_if( + auto num_isolated_vertices = thrust::count_if( handle.get_thrust_policy(), - vertices, - vertices + num_vertices, + (*local_vertices).begin(), + (*local_vertices).end(), [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { return !thrust::binary_search(thrust::seq, label_first, label_last, v); }); isolated_vertices.resize(num_isolated_vertices, handle.get_stream()); thrust::copy_if(handle.get_thrust_policy(), - vertices, - vertices + num_vertices, + (*local_vertices).begin(), + (*local_vertices).end(), isolated_vertices.begin(), [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { return !thrust::binary_search(thrust::seq, label_first, label_last, v); }); + (*local_vertices).resize(0, handle.get_stream()); + (*local_vertices).shrink_to_fit(handle.get_stream()); if (isolated_vertices.size() > 0) { labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream()); @@ -389,29 +389,32 @@ compute_renumber_map(raft::handle_t const& handle, template void expensive_check_edgelist( raft::handle_t const& handle, - std::optional> vertex_span, - std::vector const& edgelist_major_vertices, - std::vector const& edgelist_minor_vertices, + std::optional> const& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets) { - rmm::device_uvector sorted_local_vertices(size_t{0}, handle.get_stream()); - if (vertex_span) { - auto [vertices, num_vertices] = *vertex_span; - sorted_local_vertices.resize(num_vertices, handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), vertices, vertices + num_vertices, sorted_local_vertices.begin()); + std::optional> sorted_local_vertices{std::nullopt}; + if (local_vertices) { + sorted_local_vertices = + rmm::device_uvector((*local_vertices).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*local_vertices).begin(), + (*local_vertices).end(), + (*sorted_local_vertices).begin()); thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - CUGRAPH_EXPECTS(static_cast(thrust::distance( - sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end()))) == sorted_local_vertices.size(), - "Invalid input argument: local_vertices should not have duplicates."); + handle.get_thrust_policy(), (*sorted_local_vertices).begin(), (*sorted_local_vertices).end()); + CUGRAPH_EXPECTS( + static_cast(thrust::distance((*sorted_local_vertices).begin(), + thrust::unique(handle.get_thrust_policy(), + (*sorted_local_vertices).begin(), + (*sorted_local_vertices).end()))) == + (*sorted_local_vertices).size(), + "Invalid input argument: (local_)vertices should not have duplicates."); } - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto const comm_rank = comm.get_rank(); @@ -422,27 +425,28 @@ void expensive_check_edgelist( auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); - CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && - (edgelist_major_vertices.size() == static_cast(col_comm_size)), - "Invalid input argument: both edgelist_major_vertices.size() & " - "edgelist_minor_vertices.size() should coincide with col_comm_size."); + CUGRAPH_EXPECTS((edgelist_majors.size() == edgelist_minors.size()) && + (edgelist_majors.size() == static_cast(col_comm_size)), + "Invalid input argument: both edgelist_majors.size() & " + "edgelist_minors.size() should coincide with col_comm_size."); - auto [local_vertices, num_local_vertices] = *vertex_span; - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - local_vertices, - local_vertices + num_local_vertices, - [comm_rank, - key_func = - detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { - return key_func(val) != comm_rank; - }) == 0, - "Invalid input argument: local_vertices should be pre-shuffled."); - - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + if (sorted_local_vertices) { + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + (*sorted_local_vertices).begin(), + (*sorted_local_vertices).end(), + [comm_rank, + key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { + return key_func(val) != comm_rank; + }) == 0, + "Invalid input argument: local_vertices should be pre-shuffled."); + } + + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); CUGRAPH_EXPECTS( thrust::count_if( handle.get_thrust_policy(), @@ -463,10 +467,10 @@ void expensive_check_edgelist( (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); }) == 0, - "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "Invalid input argument: edgelist_majors & edgelist_minors should be " "pre-shuffled."); - if (vertex_span) { + if (sorted_local_vertices) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -483,22 +487,20 @@ void expensive_check_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + rmm::device_uvector sorted_majors(0, handle.get_stream()); { auto recvcounts = - host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream()); std::vector displacements(recvcounts.size(), size_t{0}); std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_major_vertices.resize(displacements.back() + recvcounts.back(), - handle.get_stream()); + sorted_majors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); device_allgatherv(col_comm, - sorted_local_vertices.data(), - sorted_major_vertices.data(), + (*sorted_local_vertices).data(), + sorted_majors.data(), recvcounts, displacements, handle.get_stream()); - thrust::sort( - handle.get_thrust_policy(), sorted_major_vertices.begin(), sorted_major_vertices.end()); + thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); } // barrier is necessary here to avoid potential overlap (which can leads to deadlock) @@ -512,22 +514,20 @@ void expensive_check_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + rmm::device_uvector sorted_minors(0, handle.get_stream()); { auto recvcounts = - host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream()); std::vector displacements(recvcounts.size(), size_t{0}); std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), - handle.get_stream()); + sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); device_allgatherv(row_comm, - sorted_local_vertices.data(), - sorted_minor_vertices.data(), + (*sorted_local_vertices).data(), + sorted_minors.data(), recvcounts, displacements, handle.get_stream()); - thrust::sort( - handle.get_thrust_policy(), sorted_minor_vertices.begin(), sorted_minor_vertices.end()); + thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end()); } // barrier is necessary here to avoid potential overlap (which can leads to deadlock) @@ -541,27 +541,23 @@ void expensive_check_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); CUGRAPH_EXPECTS( thrust::count_if( handle.get_thrust_policy(), edge_first, edge_first + edgelist_edge_counts[i], - [num_major_vertices = static_cast(sorted_major_vertices.size()), - sorted_major_vertices = sorted_major_vertices.data(), - num_minor_vertices = static_cast(sorted_minor_vertices.size()), - sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { - return !thrust::binary_search(thrust::seq, - sorted_major_vertices, - sorted_major_vertices + num_major_vertices, - thrust::get<0>(e)) || - !thrust::binary_search(thrust::seq, - sorted_minor_vertices, - sorted_minor_vertices + num_minor_vertices, - thrust::get<1>(e)); + [num_majors = static_cast(sorted_majors.size()), + sorted_majors = sorted_majors.data(), + num_minors = static_cast(sorted_minors.size()), + sorted_minors = sorted_minors.data()] __device__(auto e) { + return !thrust::binary_search( + thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) || + !thrust::binary_search( + thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e)); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } @@ -570,8 +566,8 @@ void expensive_check_edgelist( CUGRAPH_EXPECTS( thrust::count_if( handle.get_thrust_policy(), - edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j], - edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j], + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], [row_comm_size, col_comm_rank, j, @@ -580,26 +576,26 @@ void expensive_check_edgelist( return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j; }) == 0, "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is " - "true, edgelist_major_vertices & edgelist_minor_vertices should be properly grouped " + "true, edgelist_majors & edgelist_minors should be properly grouped " "within each local partition."); } } } } else { - assert(edgelist_major_vertices.size() == 1); - assert(edgelist_minor_vertices.size() == 1); + assert(edgelist_majors.size() == 1); + assert(edgelist_minors.size() == 1); - if (vertex_span) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); + if (sorted_local_vertices) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[0], edgelist_minors[0])); CUGRAPH_EXPECTS( thrust::count_if( handle.get_thrust_policy(), edge_first, edge_first + edgelist_edge_counts[0], - [sorted_local_vertices = sorted_local_vertices.data(), + [sorted_local_vertices = (*sorted_local_vertices).data(), num_sorted_local_vertices = - static_cast(sorted_local_vertices.size())] __device__(auto e) { + static_cast((*sorted_local_vertices).size())] __device__(auto e) { return !thrust::binary_search(thrust::seq, sorted_local_vertices, sorted_local_vertices + num_sorted_local_vertices, @@ -609,7 +605,7 @@ void expensive_check_edgelist( sorted_local_vertices + num_sorted_local_vertices, thrust::get<1>(e)); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } @@ -628,9 +624,9 @@ std::enable_if_t< std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::optional>&& local_vertices, + std::vector const& edgelist_majors /* [INOUT] */, + std::vector const& edgelist_minors /* [INOUT] */, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) @@ -645,17 +641,17 @@ renumber_edgelist( auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); - CUGRAPH_EXPECTS(edgelist_major_vertices.size() == static_cast(col_comm_size), - "Invalid input arguments: erroneous edgelist_major_vertices.size()."); - CUGRAPH_EXPECTS(edgelist_minor_vertices.size() == static_cast(col_comm_size), - "Invalid input arguments: erroneous edgelist_minor_vertices.size()."); + CUGRAPH_EXPECTS(edgelist_majors.size() == static_cast(col_comm_size), + "Invalid input arguments: erroneous edgelist_majors.size()."); + CUGRAPH_EXPECTS(edgelist_minors.size() == static_cast(col_comm_size), + "Invalid input arguments: erroneous edgelist_minors.size()."); CUGRAPH_EXPECTS(edgelist_edge_counts.size() == static_cast(col_comm_size), "Invalid input arguments: erroneous edgelist_edge_counts.size()."); if (edgelist_intra_partition_segment_offsets) { CUGRAPH_EXPECTS( (*edgelist_intra_partition_segment_offsets).size() == static_cast(col_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { CUGRAPH_EXPECTS( (*edgelist_intra_partition_segment_offsets)[i].size() == static_cast(row_comm_size + 1), @@ -673,19 +669,19 @@ renumber_edgelist( } } - std::vector edgelist_const_major_vertices(edgelist_major_vertices.size()); - std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); - for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { - edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; - edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + std::vector edgelist_const_majors(edgelist_majors.size()); + std::vector edgelist_const_minors(edgelist_const_majors.size()); + for (size_t i = 0; i < edgelist_const_majors.size(); ++i) { + edgelist_const_majors[i] = edgelist_majors[i]; + edgelist_const_minors[i] = edgelist_minors[i]; } if (do_expensive_check) { detail::expensive_check_edgelist( handle, - local_vertex_span, - edgelist_const_major_vertices, - edgelist_const_minor_vertices, + local_vertices, + edgelist_const_majors, + edgelist_const_minors, edgelist_edge_counts, edgelist_intra_partition_segment_offsets); } @@ -697,9 +693,9 @@ renumber_edgelist( num_unique_edge_majors, num_unique_edge_minors] = detail::compute_renumber_map(handle, - local_vertex_span, - edgelist_const_major_vertices, - edgelist_const_minor_vertices, + std::move(local_vertices), + edgelist_const_majors, + edgelist_const_minors, edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened @@ -742,13 +738,13 @@ renumber_edgelist( { vertex_t max_matrix_partition_major_size{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { max_matrix_partition_major_size = std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); } rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, handle.get_stream()); - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { device_bcast(col_comm, renumber_map_labels.data(), renumber_map_major_labels.data(), @@ -777,9 +773,8 @@ renumber_edgelist( renumber_map_major_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); - renumber_map.find(edgelist_major_vertices[i], - edgelist_major_vertices[i] + edgelist_edge_counts[i], - edgelist_major_vertices[i]); + renumber_map.find( + edgelist_majors[i], edgelist_majors[i] + edgelist_edge_counts[i], edgelist_majors[i]); } } @@ -830,11 +825,11 @@ renumber_edgelist( thrust::make_counting_iterator( partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i)))); renumber_map.insert(pair_first, pair_first + segment_size); - for (size_t j = 0; j < edgelist_minor_vertices.size(); ++j) { + for (size_t j = 0; j < edgelist_minors.size(); ++j) { renumber_map.find( - edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i], - edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i + 1], - edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i]); + edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i], + edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i + 1], + edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i]); } } } else { @@ -870,10 +865,9 @@ renumber_edgelist( renumber_map_minor_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))); renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { - renumber_map.find(edgelist_minor_vertices[i], - edgelist_minor_vertices[i] + edgelist_edge_counts[i], - edgelist_minor_vertices[i]); + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + renumber_map.find( + edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]); } } // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two @@ -902,18 +896,18 @@ std::enable_if_t< !multi_gpu, std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, - std::optional> vertex_span, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, + std::optional>&& vertices, + vertex_t* edgelist_majors /* [INOUT] */, + vertex_t* edgelist_minors /* [INOUT] */, edge_t num_edgelist_edges, bool do_expensive_check) { if (do_expensive_check) { detail::expensive_check_edgelist( handle, - vertex_span, - std::vector{edgelist_major_vertices}, - std::vector{edgelist_minor_vertices}, + vertices, + std::vector{edgelist_majors}, + std::vector{edgelist_minors}, std::vector{num_edgelist_edges}, std::nullopt); } @@ -923,9 +917,9 @@ renumber_edgelist(raft::handle_t const& handle, std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) = detail::compute_renumber_map( handle, - vertex_span, - std::vector{edgelist_major_vertices}, - std::vector{edgelist_minor_vertices}, + std::move(vertices), + std::vector{edgelist_majors}, + std::vector{edgelist_minors}, std::vector{num_edgelist_edges}); double constexpr load_factor = 0.7; @@ -946,10 +940,8 @@ renumber_edgelist(raft::handle_t const& handle, auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(renumber_map_labels.begin(), thrust::make_counting_iterator(vertex_t{0}))); renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); - renumber_map.find( - edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices); - renumber_map.find( - edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); + renumber_map.find(edgelist_majors, edgelist_majors + num_edgelist_edges, edgelist_majors); + renumber_map.find(edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors); return std::make_tuple(std::move(renumber_map_labels), renumber_meta_t{segment_offsets}); diff --git a/cpp/src/structure/renumber_edgelist_mg.cu b/cpp/src/structure/renumber_edgelist_mg.cu index 4e9f37e10bb..eecad91f5d6 100644 --- a/cpp/src/structure/renumber_edgelist_mg.cu +++ b/cpp/src/structure/renumber_edgelist_mg.cu @@ -22,9 +22,9 @@ namespace cugraph { template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::optional>&& local_vertices, + std::vector const& edgelist_majors /* [INOUT] */, + std::vector const& edgelist_minors /* [INOUT] */, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); @@ -32,9 +32,9 @@ renumber_edgelist( template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::optional>&& local_vertices, + std::vector const& edgelist_majors /* [INOUT] */, + std::vector const& edgelist_minors /* [INOUT] */, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); @@ -42,9 +42,9 @@ renumber_edgelist( template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::optional>&& local_vertices, + std::vector const& edgelist_majors /* [INOUT] */, + std::vector const& edgelist_minors /* [INOUT] */, std::vector const& edgelist_edge_counts, std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); diff --git a/cpp/src/structure/renumber_edgelist_sg.cu b/cpp/src/structure/renumber_edgelist_sg.cu index 3bb25d74b2e..fd591eda052 100644 --- a/cpp/src/structure/renumber_edgelist_sg.cu +++ b/cpp/src/structure/renumber_edgelist_sg.cu @@ -20,30 +20,27 @@ namespace cugraph { // SG instantiation template std::tuple, renumber_meta_t> -renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_vertex_span, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool do_expensive_check); +renumber_edgelist(raft::handle_t const& handle, + std::optional>&& vertices, + int32_t* edgelist_majors /* [INOUT] */, + int32_t* edgelist_minors /* [INOUT] */, + int32_t num_edgelist_edges, + bool do_expensive_check); template std::tuple, renumber_meta_t> -renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_vertex_span, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool do_expensive_check); +renumber_edgelist(raft::handle_t const& handle, + std::optional>&& vertices, + int32_t* edgelist_majors /* [INOUT] */, + int32_t* edgelist_minors /* [INOUT] */, + int64_t num_edgelist_edges, + bool do_expensive_check); template std::tuple, renumber_meta_t> -renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_vertex_span, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool do_expensive_check); +renumber_edgelist(raft::handle_t const& handle, + std::optional>&& vertices, + int64_t* edgelist_majors /* [INOUT] */, + int64_t* edgelist_minors /* [INOUT] */, + int64_t num_edgelist_edges, + bool do_expensive_check); } // namespace cugraph From da39f24cd509e4e60ad8972ff3dd38cd37cd636d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 29 Oct 2021 15:27:26 -0400 Subject: [PATCH 6/7] clang-format --- cpp/tests/utilities/base_fixture.hpp | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 7b922b87111..9e4ce610569 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -185,32 +185,32 @@ inline auto parse_test_options(int argc, char** argv) return RUN_ALL_TESTS(); \ } -#define CUGRAPH_MG_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - MPI_TRY(MPI_Init(&argc, &argv)); \ - int comm_rank{}; \ - int comm_size{}; \ - MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank)); \ - MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); \ - int num_gpus_per_node{}; \ - CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); \ - CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_test_options(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cugraph::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - cugraph::test::g_perf = cmd_opts["perf"].as(); \ - cugraph::test::g_rmat_scale = \ - (cmd_opts.count("rmat_scale") > 0) \ - ? std::make_optional(cmd_opts["rmat_scale"].as()) \ - : std::nullopt; \ - cugraph::test::g_rmat_edge_factor = \ - (cmd_opts.count("rmat_edge_factor") > 0) \ - ? std::make_optional(cmd_opts["rmat_edge_factor"].as()) \ - : std::nullopt; \ - auto ret = RUN_ALL_TESTS(); \ - MPI_TRY(MPI_Finalize()); \ - return ret; \ +#define CUGRAPH_MG_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + MPI_TRY(MPI_Init(&argc, &argv)); \ + int comm_rank{}; \ + int comm_size{}; \ + MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank)); \ + MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); \ + int num_gpus_per_node{}; \ + CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); \ + CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_test_options(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + cugraph::test::g_perf = cmd_opts["perf"].as(); \ + cugraph::test::g_rmat_scale = \ + (cmd_opts.count("rmat_scale") > 0) \ + ? std::make_optional(cmd_opts["rmat_scale"].as()) \ + : std::nullopt; \ + cugraph::test::g_rmat_edge_factor = \ + (cmd_opts.count("rmat_edge_factor") > 0) \ + ? std::make_optional(cmd_opts["rmat_edge_factor"].as()) \ + : std::nullopt; \ + auto ret = RUN_ALL_TESTS(); \ + MPI_TRY(MPI_Finalize()); \ + return ret; \ } From 292eda5183ef50c0d58ffdd32cc6893965dff960 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 29 Oct 2021 13:55:59 -0700 Subject: [PATCH 7/7] remove unncessary host barrier synchronization --- .../prims/copy_to_adj_matrix_row_col.cuh | 89 --------------- .../copy_v_transform_reduce_in_out_nbr.cuh | 45 -------- ...ransform_reduce_key_aggregated_out_nbr.cuh | 42 ------- .../update_frontier_v_push_if_out_nbr.cuh | 81 ------------- cpp/src/structure/coarsen_graph_impl.cuh | 21 ---- .../create_graph_from_edgelist_impl.cuh | 36 ------ cpp/src/structure/renumber_edgelist_impl.cuh | 107 ------------------ cpp/src/structure/renumber_utils_impl.cuh | 31 ----- 8 files changed, 452 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 7100e7c8663..508294c9e89 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -65,17 +64,6 @@ void copy_to_matrix_major(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - if (matrix_major_value_output.key_first()) { auto key_offsets = GraphViewType::is_adj_matrix_transposed ? *(graph_view.get_local_sorted_unique_edge_col_offsets()) @@ -122,17 +110,6 @@ void copy_to_matrix_major(raft::handle_t const& handle, displacements, handle.get_stream()); } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } else { assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed @@ -170,17 +147,6 @@ void copy_to_matrix_major(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - auto rx_counts = host_scalar_allgather(col_comm, static_cast(thrust::distance(vertex_first, vertex_last)), @@ -260,17 +226,6 @@ void copy_to_matrix_major(raft::handle_t const& handle, matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); } } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } else { assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed @@ -305,17 +260,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - if (matrix_minor_value_output.key_first()) { auto key_offsets = GraphViewType::is_adj_matrix_transposed ? *(graph_view.get_local_sorted_unique_edge_row_offsets()) @@ -362,17 +306,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle, displacements, handle.get_stream()); } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } else { assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed @@ -410,17 +343,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - auto rx_counts = host_scalar_allgather(row_comm, static_cast(thrust::distance(vertex_first, vertex_last)), @@ -498,17 +420,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle, matrix_minor_value_output.value_data()); } } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } else { assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 9fb7e8bf2a1..0cf6633d23e 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -603,17 +602,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - device_reduce(col_comm, major_buffer_first, vertex_value_output_first, @@ -621,17 +609,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, raft::comms::op_t::SUM, i, handle.get_stream()); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } } @@ -645,17 +622,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - for (int i = 0; i < row_comm_size; ++i) { auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); @@ -668,17 +634,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, i, handle.get_stream()); } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } } diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index aac89919892..9aca0361ccf 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -242,17 +241,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - auto map_counts = host_scalar_allgather( row_comm, static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)), @@ -320,21 +308,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 2. aggregate each vertex out-going edges based on keys and transform-reduce. - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - rmm::device_uvector major_vertices(0, handle.get_stream()); auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { @@ -546,21 +519,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( } } - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - auto execution_policy = handle.get_thrust_policy(); thrust::fill(execution_policy, vertex_value_output_first, diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index c84fcd19ce4..29a3c1cca1e 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -609,20 +608,6 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( edge_t ret{0}; - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - auto const& cur_frontier_bucket = frontier.get_bucket(cur_frontier_bucket_idx); vertex_t const* local_frontier_vertex_first{nullptr}; vertex_t const* local_frontier_vertex_last{nullptr}; @@ -719,20 +704,6 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( } } - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - return ret; } @@ -840,21 +811,6 @@ void update_frontier_v_push_if_out_nbr( // 1. fill the buffer - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - auto key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); auto payload_buffer = detail::allocate_optional_payload_buffer(size_t{0}, handle.get_stream()); @@ -1097,21 +1053,6 @@ void update_frontier_v_push_if_out_nbr( } } - if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - // 2. reduce the buffer auto num_buffer_elements = detail::sort_and_reduce_buffer_elements( @@ -1128,17 +1069,6 @@ void update_frontier_v_push_if_out_nbr( auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - std::vector h_vertex_lasts(row_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); @@ -1189,17 +1119,6 @@ void update_frontier_v_push_if_out_nbr( detail::get_optional_payload_buffer_begin(payload_buffer), size_dataframe_buffer(key_buffer), reduce_op); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } // 3. update vertex properties and frontier diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index f49c7be626d..44f87c71085 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -238,16 +237,6 @@ coarsen_graph( for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { // 1-1. locally construct coarsened edge list - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif rmm::device_uvector major_labels( store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i) : graph_view.get_number_of_local_adj_matrix_partition_rows(i), @@ -258,16 +247,6 @@ coarsen_graph( major_labels.size(), static_cast(i), handle.get_stream()); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 4516b087180..849705df4ac 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -113,19 +112,6 @@ void expensive_check_edgelist(raft::handle_t const& handle, "Invalid input argument: edgelist_majors & edgelist_minors should be pre-shuffled."); if (vertices) { - // FIXME: this barrier is unnecessary if the above host_scalar_allreduce is a true host - // operation (as it serves as a barrier) barrier is necessary here to avoid potential - // overlap (which can leads to deadlock) between two different communicators (beginning of - // col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - rmm::device_uvector sorted_majors(0, handle.get_stream()); { auto recvcounts = host_scalar_allgather(col_comm, (*vertices).size(), handle.get_stream()); @@ -141,17 +127,6 @@ void expensive_check_edgelist(raft::handle_t const& handle, thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) - // between two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - rmm::device_uvector sorted_minors(0, handle.get_stream()); { auto recvcounts = host_scalar_allgather(row_comm, (*vertices).size(), handle.get_stream()); @@ -167,17 +142,6 @@ void expensive_check_edgelist(raft::handle_t const& handle, thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end()); } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) - // between two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); CUGRAPH_EXPECTS( diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 2c22630ea3f..8d4eb0abf10 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -59,22 +58,6 @@ compute_renumber_map(raft::handle_t const& handle, // 1. acquire (unique major label, count) pairs - if (multi_gpu) { - auto& comm = handle.get_comms(); - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - ; - comm.barrier(); // currently, this is ncclAllReduce -#endif - } - rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); vertex_t num_local_unique_edge_majors{0}; @@ -202,17 +185,6 @@ compute_renumber_map(raft::handle_t const& handle, auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_size = row_comm.get_size(); - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - if (row_comm_size > 1) { rmm::device_uvector rx_minor_labels(0, handle.get_stream()); std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( @@ -230,18 +202,6 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); minor_labels = std::move(rx_minor_labels); } - - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between - // two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK - // and MPI barrier with MPI) - // - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } minor_labels.shrink_to_fit(handle.get_stream_view()); @@ -474,19 +434,6 @@ void expensive_check_edgelist( auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - // FIXME: this barrier is unnecessary if the above host_scalar_allreduce is a true host - // operation (as it serves as a barrier) barrier is necessary here to avoid potential - // overlap (which can leads to deadlock) between two different communicators (beginning of - // col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - rmm::device_uvector sorted_majors(0, handle.get_stream()); { auto recvcounts = @@ -503,17 +450,6 @@ void expensive_check_edgelist( thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) - // between two different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - rmm::device_uvector sorted_minors(0, handle.get_stream()); { auto recvcounts = @@ -530,17 +466,6 @@ void expensive_check_edgelist( thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end()); } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) - // between two different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with - // DASK and MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); CUGRAPH_EXPECTS( @@ -724,18 +649,6 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time - // FIXME: this barrier is unnecessary if the above host_scalar_allgather is a true host operation - // (as it serves as a barrier) barrier is necessary here to avoid potential overlap (which can - // leads to deadlock) between two different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif - { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -778,16 +691,6 @@ renumber_edgelist( } } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two - // different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part @@ -870,16 +773,6 @@ renumber_edgelist( edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]); } } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two - // different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif return std::make_tuple( std::move(renumber_map_labels), diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index b719fe6bc3c..871d71975df 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -163,16 +162,6 @@ void unrenumber_local_int_edges( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two - // different communicators (beginning of col_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -224,16 +213,6 @@ void unrenumber_local_int_edges( } } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two - // different communicators (beginning of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif vertex_t matrix_partition_minor_size{0}; for (int i = 0; i < row_comm_size; ++i) { auto vertex_partition_rank = col_comm_rank * row_comm_size + i; @@ -340,16 +319,6 @@ void unrenumber_local_int_edges( edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]); } } - // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two - // different communicators (end of row_comm) -#if 1 - // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and - // MPI barrier with MPI) - host_barrier(comm, handle.get_stream_view()); -#else - handle.get_stream_view().synchronize(); - comm.barrier(); // currently, this is ncclAllReduce -#endif } } // namespace detail