From 656638d17e43db99fb9caeee0997d73b4aa8a09e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Wed, 20 Jul 2022 17:09:31 -0700
Subject: [PATCH] Enable concurrent broadcasts in
 update_edge_partition_minor_property() (#2413)

Dependent on https://github.com/rapidsai/raft/pull/742

Place multiple broadcast operations inside ncclGroupStart and ncclGroupEnd
wrappers to improve performance.
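For illustration only (not part of the diff below): a minimal sketch of the
grouped-broadcast pattern this change enables. The helper name
broadcast_all_partitions and its parameters are hypothetical; device_bcast and
the new device_group_start()/device_group_end() wrappers are the cugraph
utilities touched by this patch.

```cpp
#include <cugraph/utilities/device_comm.cuh>

#include <raft/handle.hpp>

#include <vector>

// Issue one broadcast per root rank, but let NCCL schedule them as a single
// grouped operation instead of serializing one ncclBroadcast per iteration.
// Hypothetical helper for illustration; not part of the cugraph API.
template <typename InputIterator, typename OutputIterator>
void broadcast_all_partitions(raft::handle_t const& handle,
                              raft::comms::comms_t const& row_comm,
                              std::vector<InputIterator> const& input_firsts,
                              std::vector<OutputIterator> const& output_firsts,
                              std::vector<size_t> const& counts)
{
  cugraph::device_group_start(row_comm);  // begin grouping (wraps ncclGroupStart)
  for (int root = 0; root < static_cast<int>(counts.size()); ++root) {
    cugraph::device_bcast(row_comm,
                          input_firsts[root],
                          output_firsts[root],
                          counts[root],
                          root,
                          handle.get_stream());
  }
  cugraph::device_group_end(row_comm);  // end grouping (wraps ncclGroupEnd)
}
```

update_edge_partition_minor_property() below applies the same pattern round by
round, capping the number of concurrent broadcasts to bound the size of the
receive buffers.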
Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/2413
---
 cpp/include/cugraph/utilities/device_comm.cuh |  4 ++
 ...update_edge_partition_src_dst_property.cuh | 40 ++-----------------
 cpp/src/structure/graph_impl.cuh              | 36 ++++++++---------
 3 files changed, 23 insertions(+), 57 deletions(-)

diff --git a/cpp/include/cugraph/utilities/device_comm.cuh b/cpp/include/cugraph/utilities/device_comm.cuh
index 6d342aad06e..27e3512ccf5 100644
--- a/cpp/include/cugraph/utilities/device_comm.cuh
+++ b/cpp/include/cugraph/utilities/device_comm.cuh
@@ -1091,4 +1091,8 @@ device_gatherv(raft::comms::comms_t const& comm,
     .run(comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view);
 }
 
+inline void device_group_start(raft::comms::comms_t const& comm) { comm.group_start(); }
+
+inline void device_group_end(raft::comms::comms_t const& comm) { comm.group_end(); }
+
 }  // namespace cugraph
diff --git a/cpp/src/prims/update_edge_partition_src_dst_property.cuh b/cpp/src/prims/update_edge_partition_src_dst_property.cuh
index 8cee40d3438..ffa1baf37fa 100644
--- a/cpp/src/prims/update_edge_partition_src_dst_property.cuh
+++ b/cpp/src/prims/update_edge_partition_src_dst_property.cuh
@@ -294,8 +294,6 @@ void update_edge_partition_minor_property(
       key_offsets = *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets());
     }
 
-#if 0  // FIXME: should not directly call ncclGroupStart(), ncclGroupEnd(), enable after adding a
-       // raft::comms mechanism to group broadcast operations
     // memory footprint vs parallelism trade-off
     // memory requirement per loop is
     // (V/comm_size) * sizeof(value_t)
@@ -327,10 +325,7 @@ void update_edge_partition_minor_property(
     }
 
     for (size_t round = 0; round < num_rounds; ++round) {
-      // FIXME: better use a higher-level interface than directly invoking NCCL functions.
-      if (ncclGroupStart() != ncclSuccess) {
-        CUGRAPH_FAIL("ncclGroupStart failure.");
-      }
+      device_group_start(row_comm);
       for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
         auto j = num_rounds * i + round;
         if (j < static_cast<size_t>(row_comm_size)) {
           device_bcast(row_comm,
                        vertex_property_input_first,
                        get_dataframe_buffer_begin(rx_value_buffers[i]),
                        graph_view.vertex_partition_range_size(col_comm_rank * row_comm_size + j),
                        j,
                        handle.get_stream());
         }
       }
-      // FIXME: better use a higher-level interface than directly invoking NCCL functions.
-      if (ncclGroupEnd() != ncclSuccess) {
-        CUGRAPH_FAIL("ncclGroupEnd failure.");
-      }
+      device_group_end(row_comm);
+
       for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
         auto j = num_rounds * i + round;
         if (j < static_cast<size_t>(row_comm_size)) {
@@ -363,33 +356,6 @@ void update_edge_partition_minor_property(
         }
       }
     }
-#else
-    vertex_t max_rx_size{0};
-    for (int i = 0; i < row_comm_size; ++i) {
-      max_rx_size = std::max(
-        max_rx_size, graph_view.vertex_partition_range_size(col_comm_rank * row_comm_size + i));
-    }
-    auto rx_value_buffer = allocate_dataframe_buffer<value_t>(max_rx_size, handle.get_stream());
-    auto rx_value_first  = get_dataframe_buffer_begin(rx_value_buffer);
-    for (int i = 0; i < row_comm_size; ++i) {
-      device_bcast(row_comm,
-                   vertex_property_input_first,
-                   rx_value_first,
-                   graph_view.vertex_partition_range_size(col_comm_rank * row_comm_size + i),
-                   i,
-                   handle.get_stream());
-
-      auto v_offset_first = thrust::make_transform_iterator(
-        (*(edge_partition_minor_property_output.keys())).begin() + key_offsets[i],
-        [v_first = graph_view.vertex_partition_range_first(
-           col_comm_rank * row_comm_size + i)] __device__(auto v) { return v - v_first; });
-      thrust::gather(handle.get_thrust_policy(),
-                     v_offset_first,
-                     v_offset_first + (key_offsets[i + 1] - key_offsets[i]),
-                     rx_value_first,
-                     edge_partition_minor_property_output.value_first() + key_offsets[i]);
-    }
-#endif
   } else {
     std::vector<size_t> rx_counts(row_comm_size, size_t{0});
     std::vector<size_t> displacements(row_comm_size, size_t{0});
diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 12e5a5dc234..1811ae196b7 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -999,20 +999,18 @@ graph_t
-    auto major_hypersparse_first =
+    auto [offsets, indices, weights, dcs_nzd_vertices] = compress_edgelist(
+      edgelists[i],
+      major_range_first,
       use_dcs ? std::optional<vertex_t>{major_range_first +
                                         (*edge_partition_segment_offsets_)
                                           [(*(meta.segment_offsets)).size() * i +
                                            detail::num_sparse_segments_per_vertex_partition]}
-              : std::nullopt;
-    auto [offsets, indices, weights, dcs_nzd_vertices] =
-      compress_edgelist(edgelists[i],
-                        major_range_first,
-                        major_hypersparse_first,
-                        major_range_last,
-                        minor_range_first,
-                        minor_range_last,
-                        handle.get_stream());
+              : std::nullopt,
+      major_range_last,
+      minor_range_first,
+      minor_range_last,
+      handle.get_stream());
 
     edge_partition_offsets_.push_back(std::move(offsets));
     edge_partition_indices_.push_back(std::move(indices));
@@ -1149,20 +1147,18 @@ graph_t
-    auto major_hypersparse_first =
+    auto [offsets, indices, weights, dcs_nzd_vertices] = compress_edgelist(
+      edgelists[i],
+      major_range_first,
       use_dcs ? std::optional<vertex_t>{major_range_first +
                                         (*edge_partition_segment_offsets_)
                                           [(*(meta.segment_offsets)).size() * i +
                                            detail::num_sparse_segments_per_vertex_partition]}
-              : std::nullopt;
-    auto [offsets, indices, weights, dcs_nzd_vertices] =
-      compress_edgelist(edgelists[i],
-                        major_range_first,
-                        major_hypersparse_first,
-                        major_range_last,
-                        minor_range_first,
-                        minor_range_last,
-                        handle.get_stream());
+              : std::nullopt,
+      major_range_last,
+      minor_range_first,
+      minor_range_last,
+      handle.get_stream());
 
     edgelist_src_partitions[i].resize(0, handle.get_stream());
     edgelist_src_partitions[i].shrink_to_fit(handle.get_stream());
     edgelist_dst_partitions[i].resize(0, handle.get_stream());