From aaad98e9e81b84cd1e29c08349afcf8b5669d6e1 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 19 Oct 2021 13:36:15 -0400
Subject: [PATCH 1/7] add overflow check

---
 cpp/tests/utilities/test_graphs.hpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index 931c69c7357..dc836c0a6f4 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -134,6 +134,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
              bool>
   construct_edgelist(raft::handle_t const& handle, bool test_weighted) const
   {
+    CUGRAPH_EXPECTS(
+      (size_t{1} << scale_) <= static_cast<size_t>(std::numeric_limits<vertex_t>::max()),
+      "Invalid template parameter: scale_ too large for vertex_t.");
+    CUGRAPH_EXPECTS(((size_t{1} << scale_) * edge_factor_) <=
+                      static_cast<size_t>(std::numeric_limits<edge_t>::max()),
+                    "Invalid template parameter: (scale_, edge_factor_) too large for edge_t");
+
     std::vector<size_t> partition_ids(1);
     size_t num_partitions;
 

From 3ce426b36a345a892898dba43c6e13ac8b1d6bc8 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 19 Oct 2021 16:56:52 -0400
Subject: [PATCH 2/7] minor reordering of opreations to potentially lower peak
 memory usage

---
 cpp/src/structure/graph_impl.cuh | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 77fe5a3d7b6..3de8e26415a 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -846,6 +846,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
 
   auto [edgelist_rows, edgelist_cols, edgelist_weights] =
     this->decompress_to_edgelist(handle, wrapped_renumber_map, true);
+
   std::tie(edgelist_rows, edgelist_cols, edgelist_weights) =
     symmetrize_edgelist<vertex_t, weight_t, store_transposed, multi_gpu>(
       handle,
@@ -887,13 +888,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
 
   auto [edgelist_rows, edgelist_cols, edgelist_weights] =
     this->decompress_to_edgelist(handle, renumber_map, true);
-  auto vertex_span = renumber ? std::move(renumber_map)
-                              : std::make_optional<rmm::device_uvector<vertex_t>>(
-                                  number_of_vertices, handle.get_stream());
-  if (!renumber) {
-    thrust::sequence(
-      handle.get_thrust_policy(), (*vertex_span).begin(), (*vertex_span).end(), vertex_t{0});
-  }
+
   std::tie(edgelist_rows, edgelist_cols, edgelist_weights) =
     symmetrize_edgelist<vertex_t, weight_t, store_transposed, multi_gpu>(
       handle,
@@ -902,6 +897,14 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
       std::move(edgelist_weights),
       reciprocal);
 
+  auto vertex_span = renumber ? std::move(renumber_map)
+                              : std::make_optional<rmm::device_uvector<vertex_t>>(
+                                  number_of_vertices, handle.get_stream());
+  if (!renumber) {
+    thrust::sequence(
+      handle.get_thrust_policy(), (*vertex_span).begin(), (*vertex_span).end(), vertex_t{0});
+  }
+
   auto [symmetrized_graph, new_renumber_map] =
     create_graph_from_edgelist<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
       handle,

From 61b626090b3c8e341c36c4e2548f2a78f30a288b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@prm-dgx-22.nvidia.com>
Date: Thu, 28 Oct 2021 08:54:06 -0700
Subject: [PATCH 3/7] update cudaSetDevice call to set GPU ID to rank %
 num_gpus_per_node

---
 cpp/tests/utilities/base_fixture.hpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp
index d58d71d9bd2..7b922b87111 100644
--- a/cpp/tests/utilities/base_fixture.hpp
+++ b/cpp/tests/utilities/base_fixture.hpp
@@ -193,11 +193,9 @@ inline auto parse_test_options(int argc, char** argv)
     int comm_size{};                                                                  \
     MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank));                               \
     MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size));                               \
-    int num_gpus{};                                                                   \
-    CUDA_TRY(cudaGetDeviceCount(&num_gpus));                                          \
-    CUGRAPH_EXPECTS(                                                                  \
-      comm_size <= num_gpus, "# MPI ranks (%d) > # GPUs (%d).", comm_size, num_gpus); \
-    CUDA_TRY(cudaSetDevice(comm_rank));                                               \
+    int num_gpus_per_node{};                                                          \
+    CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));                                 \
+    CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node));                           \
     ::testing::InitGoogleTest(&argc, argv);                                           \
     auto const cmd_opts = parse_test_options(argc, argv);                             \
     auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();                     \

From 3d31cc306aa4ec0b230cd557e96f0ea3dc66407e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 29 Oct 2021 11:07:48 -0400
Subject: [PATCH 4/7] cut memory requirement in renumbering

---
 cpp/src/structure/renumber_edgelist_impl.cuh | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index f0c03a5901e..15a5ef48a2a 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -174,20 +174,30 @@ compute_renumber_map(raft::handle_t const& handle,
     edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1);
   rmm::device_uvector<vertex_t> minor_labels(minor_displs.back() + edgelist_edge_counts.back(),
                                              handle.get_stream());
-  vertex_t num_local_unique_edge_minors{0};
+  vertex_t minor_offset{0};
   for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) {
     thrust::copy(handle.get_thrust_policy(),
                  edgelist_minor_vertices[i],
                  edgelist_minor_vertices[i] + edgelist_edge_counts[i],
-                 minor_labels.begin() + minor_displs[i]);
+                 minor_labels.begin() + minor_offset);
+    thrust::sort(handle.get_thrust_policy(),
+                 minor_labels.begin() + minor_offset,
+                 minor_labels.begin() + minor_offset + edgelist_edge_counts[i]);
+    minor_offset +=
+      thrust::distance(minor_labels.begin() + minor_offset,
+                       thrust::unique(handle.get_thrust_policy(),
+                                      minor_labels.begin() + minor_offset,
+                                      minor_labels.begin() + minor_offset +
+                                        edgelist_edge_counts[i]));
   }
+  minor_labels.resize(minor_offset, handle.get_stream());
   thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end());
   minor_labels.resize(
     thrust::distance(
       minor_labels.begin(),
       thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())),
     handle.get_stream());
-  num_local_unique_edge_minors += static_cast<vertex_t>(minor_labels.size());
+  auto num_local_unique_edge_minors = static_cast<vertex_t>(minor_labels.size());
   if (multi_gpu) {
     auto& comm               = handle.get_comms();
     auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());

From 5871ea42a37ec6db1d0f73d74acf3754ddf66f8f Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 29 Oct 2021 15:19:56 -0400
Subject: [PATCH 5/7] update renumber_edgelist to take an optional vertex list
 as an R-value of std::optional<rmm::device_uvector<vertex_t>>

---
 cpp/include/cugraph/graph_functions.hpp       |  72 ++---
 cpp/src/structure/coarsen_graph_impl.cuh      |   6 +-
 .../create_graph_from_edgelist_impl.cuh       |  27 +-
 cpp/src/structure/renumber_edgelist_impl.cuh  | 292 +++++++++---------
 cpp/src/structure/renumber_edgelist_mg.cu     |  18 +-
 cpp/src/structure/renumber_edgelist_sg.cu     |  39 ++-
 6 files changed, 217 insertions(+), 237 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 01b774962f1..5ddc244b183 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -59,27 +59,26 @@ struct renumber_meta_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<!multi_gpu>
  * or multi-GPU (true).
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param local_vertex_span If valid, part of the entire set of vertices in the graph to be
- * renumbered. The first tuple element is the pointer to the array and the second tuple element is
- * the size of the array. This parameter can be used to include isolated vertices. Applying the
+ * @param local_vertices If valid, part of the entire set of vertices in the graph to be renumbered.
+ * This parameter can be used to include isolated vertices. Applying the
  * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to
  * work (vertices should be pre-shuffled).
- * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
- * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
- * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
- * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
- * minor) pair should return the GPU ID of this process and applying the
- * compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition
- * should return the partition ID of the corresponding matrix partition.
- * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
- * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
- * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
- * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
- * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
- * minor) pair should return the GPU ID of this process and applying the
- * compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition
- * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_majors Pointers (one pointer per local graph adjacency matrix partition assigned
+ * to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as is) or
+ * edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are
+ * updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process
+ * & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor)
+ * pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t
+ * fuctor to every (major, minor) pair for a local matrix partition should return the partition ID
+ * of the corresponding matrix partition.
+ * @param edgelist_minors Pointers (one pointer per local graph adjacency matrix partition assigned
+ * to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored as is)
+ * or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are
+ * updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process
+ * & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor)
+ * pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t
+ * fuctor to every (major, minor) pair for a local matrix partition should return the partition ID
+ * of the corresponding matrix partition.
  * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
  * assigned to this process).
  * @param edgelist_intra_partition_segment_offsets If valid, store segment offsets within a local
@@ -101,9 +100,9 @@ std::enable_if_t<
   std::tuple<rmm::device_uvector<vertex_t>, renumber_meta_t<vertex_t, edge_t, multi_gpu>>>
 renumber_edgelist(
   raft::handle_t const& handle,
-  std::optional<std::tuple<vertex_t const*, vertex_t>> local_vertex_span,
-  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
-  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<vertex_t>>&& local_vertices,
+  std::vector<vertex_t*> const& edgelist_majors /* [INOUT] */,
+  std::vector<vertex_t*> const& edgelist_minors /* [INOUT] */,
   std::vector<edge_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<edge_t>>> const& edgelist_intra_partition_segment_offsets,
   bool do_expensive_check = false);
@@ -117,17 +116,14 @@ renumber_edgelist(
  * or multi-GPU (true).
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param vertex_span If valid, vertices in the graph to be renumbered. The first tuple element is
- * the pointer to the array and the second tuple element is the size of the array. This parameter
- * can be used to include isolated vertices.
- * @param vertices The entire set of vertices in the graph to be renumbered.
- * @param num_vertices Number of vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
- * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter).
+ * @param vertices If valid, vertices in the graph to be renumbered. This parameter can be used to
+ * include isolated vertices.
+ * @param edgelist_majors Edge source vertex IDs (if the graph adjacency matrix is stored as is) or
+ * edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are
+ * updated in-place ([INOUT] parameter).
+ * @param edgelist_minors Edge destination vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter).
  * @param num_edgelist_edges Number of edges in the edgelist.
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, renumber_meta_t<vertex_t, edge_t, multi_gpu>>
@@ -141,9 +137,9 @@ std::enable_if_t<
   !multi_gpu,
   std::tuple<rmm::device_uvector<vertex_t>, renumber_meta_t<vertex_t, edge_t, multi_gpu>>>
 renumber_edgelist(raft::handle_t const& handle,
-                  std::optional<std::tuple<vertex_t const*, vertex_t>> vertex_span,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  std::optional<rmm::device_uvector<vertex_t>>&& vertices,
+                  vertex_t* edgelist_majors /* [INOUT] */,
+                  vertex_t* edgelist_minors /* [INOUT] */,
                   edge_t num_edgelist_edges,
                   bool do_expensive_check = false);
 
@@ -462,7 +458,7 @@ extract_induced_subgraphs(
  * or multi-GPU (true).
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param vertex_span  If valid, part of the entire set of vertices in the graph to be renumbered.
+ * @param vertices  If valid, part of the entire set of vertices in the graph to be renumbered.
  * This parameter can be used to include isolated vertices. If multi-GPU, applying the
  * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to
  * work (vertices should be pre-shuffled).
@@ -487,7 +483,7 @@ template <typename vertex_t,
 std::tuple<cugraph::graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>,
            std::optional<rmm::device_uvector<vertex_t>>>
 create_graph_from_edgelist(raft::handle_t const& handle,
-                           std::optional<rmm::device_uvector<vertex_t>>&& vertex_span,
+                           std::optional<rmm::device_uvector<vertex_t>>&& vertices,
                            rmm::device_uvector<vertex_t>&& edgelist_rows,
                            rmm::device_uvector<vertex_t>&& edgelist_cols,
                            std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index 2fbdfe25f79..f49c7be626d 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -409,8 +409,7 @@ coarsen_graph(
     }
     std::tie(renumber_map_labels, meta) = renumber_edgelist<vertex_t, edge_t, multi_gpu>(
       handle,
-      std::optional<std::tuple<vertex_t const*, vertex_t>>{
-        std::make_tuple(unique_labels.data(), static_cast<vertex_t>(unique_labels.size()))},
+      std::optional<rmm::device_uvector<vertex_t>>{std::move(unique_labels)},
       major_ptrs,
       minor_ptrs,
       counts,
@@ -493,8 +492,7 @@ coarsen_graph(
 
   auto [renumber_map_labels, meta] = renumber_edgelist<vertex_t, edge_t, multi_gpu>(
     handle,
-    std::optional<std::tuple<vertex_t const*, vertex_t>>{
-      std::make_tuple(unique_labels.data(), static_cast<vertex_t>(unique_labels.size()))},
+    std::optional<rmm::device_uvector<vertex_t>>{std::move(unique_labels)},
     coarsened_edgelist_major_vertices.data(),
     coarsened_edgelist_minor_vertices.data(),
     static_cast<edge_t>(coarsened_edgelist_major_vertices.size()),
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index 13d3acd9cc4..4516b087180 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -226,7 +226,7 @@ std::enable_if_t<
   std::tuple<cugraph::graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>,
              std::optional<rmm::device_uvector<vertex_t>>>>
 create_graph_from_edgelist_impl(raft::handle_t const& handle,
-                                std::optional<rmm::device_uvector<vertex_t>>&& local_vertex_span,
+                                std::optional<rmm::device_uvector<vertex_t>>&& local_vertices,
                                 rmm::device_uvector<vertex_t>&& edgelist_rows,
                                 rmm::device_uvector<vertex_t>&& edgelist_cols,
                                 std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
@@ -247,7 +247,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle,
 
   if (do_expensive_check) {
     expensive_check_edgelist<vertex_t, multi_gpu>(handle,
-                                                  local_vertex_span,
+                                                  local_vertices,
                                                   store_transposed ? edgelist_cols : edgelist_rows,
                                                   store_transposed ? edgelist_rows : edgelist_cols);
   }
@@ -296,10 +296,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle,
   }
   auto [renumber_map_labels, meta] = cugraph::renumber_edgelist<vertex_t, edge_t, multi_gpu>(
     handle,
-    local_vertex_span
-      ? std::optional<std::tuple<vertex_t const*, vertex_t>>{std::make_tuple(
-          (*local_vertex_span).data(), static_cast<vertex_t>((*local_vertex_span).size()))}
-      : std::nullopt,
+    std::move(local_vertices),
     major_ptrs,
     minor_ptrs,
     edgelist_edge_counts,
@@ -343,7 +340,7 @@ std::enable_if_t<
   std::tuple<cugraph::graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>,
              std::optional<rmm::device_uvector<vertex_t>>>>
 create_graph_from_edgelist_impl(raft::handle_t const& handle,
-                                std::optional<rmm::device_uvector<vertex_t>>&& vertex_span,
+                                std::optional<rmm::device_uvector<vertex_t>>&& vertices,
                                 rmm::device_uvector<vertex_t>&& edgelist_rows,
                                 rmm::device_uvector<vertex_t>&& edgelist_cols,
                                 std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
@@ -358,11 +355,13 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle,
 
   if (do_expensive_check) {
     expensive_check_edgelist<vertex_t, multi_gpu>(handle,
-                                                  vertex_span,
+                                                  vertices,
                                                   store_transposed ? edgelist_cols : edgelist_rows,
                                                   store_transposed ? edgelist_rows : edgelist_cols);
   }
 
+  auto input_vertex_list_size = vertices ? static_cast<vertex_t>((*vertices).size()) : vertex_t{0};
+
   auto renumber_map_labels =
     renumber ? std::make_optional<rmm::device_uvector<vertex_t>>(0, handle.get_stream())
              : std::nullopt;
@@ -370,9 +369,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle,
   if (renumber) {
     std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist<vertex_t, edge_t, multi_gpu>(
       handle,
-      vertex_span ? std::optional<std::tuple<vertex_t const*, vertex_t>>{std::make_tuple(
-                      (*vertex_span).data(), static_cast<vertex_t>((*vertex_span).size()))}
-                  : std::nullopt,
+      std::move(vertices),
       store_transposed ? edgelist_cols.data() : edgelist_rows.data(),
       store_transposed ? edgelist_rows.data() : edgelist_cols.data(),
       static_cast<edge_t>(edgelist_rows.size()));
@@ -382,8 +379,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle,
   if (renumber) {
     num_vertices = static_cast<vertex_t>((*renumber_map_labels).size());
   } else {
-    if (vertex_span) {
-      num_vertices = static_cast<vertex_t>((*vertex_span).size());
+    if (vertices) {
+      num_vertices = input_vertex_list_size;
     } else {
       num_vertices = 1 + cugraph::detail::compute_maximum_vertex_id(
                            handle.get_stream_view(), edgelist_rows, edgelist_cols);
@@ -416,7 +413,7 @@ template <typename vertex_t,
 std::tuple<cugraph::graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>,
            std::optional<rmm::device_uvector<vertex_t>>>
 create_graph_from_edgelist(raft::handle_t const& handle,
-                           std::optional<rmm::device_uvector<vertex_t>>&& vertex_span,
+                           std::optional<rmm::device_uvector<vertex_t>>&& vertices,
                            rmm::device_uvector<vertex_t>&& edgelist_rows,
                            rmm::device_uvector<vertex_t>&& edgelist_cols,
                            std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
@@ -426,7 +423,7 @@ create_graph_from_edgelist(raft::handle_t const& handle,
 {
   return create_graph_from_edgelist_impl<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
     handle,
-    std::move(vertex_span),
+    std::move(vertices),
     std::move(edgelist_rows),
     std::move(edgelist_cols),
     std::move(edgelist_weights),
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index 15a5ef48a2a..2c22630ea3f 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -49,9 +49,9 @@ namespace detail {
 template <typename vertex_t, typename edge_t, bool multi_gpu>
 std::tuple<rmm::device_uvector<vertex_t>, std::vector<vertex_t>, vertex_t, vertex_t>
 compute_renumber_map(raft::handle_t const& handle,
-                     std::optional<std::tuple<vertex_t const*, vertex_t>> vertex_span,
-                     std::vector<vertex_t const*> const& edgelist_major_vertices,
-                     std::vector<vertex_t const*> const& edgelist_minor_vertices,
+                     std::optional<rmm::device_uvector<vertex_t>>&& local_vertices,
+                     std::vector<vertex_t const*> const& edgelist_majors,
+                     std::vector<vertex_t const*> const& edgelist_minors,
                      std::vector<edge_t> const& edgelist_edge_counts)
 {
   // FIXME: compare this sort based approach with hash based approach in both speed and memory
@@ -78,15 +78,15 @@ compute_renumber_map(raft::handle_t const& handle,
   rmm::device_uvector<vertex_t> major_labels(0, handle.get_stream());
   rmm::device_uvector<edge_t> major_counts(0, handle.get_stream());
   vertex_t num_local_unique_edge_majors{0};
-  for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
+  for (size_t i = 0; i < edgelist_majors.size(); ++i) {
     rmm::device_uvector<vertex_t> tmp_major_labels(0, handle.get_stream());
     rmm::device_uvector<edge_t> tmp_major_counts(0, handle.get_stream());
     {
       rmm::device_uvector<vertex_t> sorted_major_labels(edgelist_edge_counts[i],
                                                         handle.get_stream());
       thrust::copy(handle.get_thrust_policy(),
-                   edgelist_major_vertices[i],
-                   edgelist_major_vertices[i] + edgelist_edge_counts[i],
+                   edgelist_majors[i],
+                   edgelist_majors[i] + edgelist_edge_counts[i],
                    sorted_major_labels.begin());
       // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse
       thrust::sort(
@@ -169,26 +169,25 @@ compute_renumber_map(raft::handle_t const& handle,
 
   // 2. acquire unique minor labels
 
-  std::vector<edge_t> minor_displs(edgelist_minor_vertices.size(), edge_t{0});
+  std::vector<edge_t> minor_displs(edgelist_minors.size(), edge_t{0});
   std::partial_sum(
     edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1);
   rmm::device_uvector<vertex_t> minor_labels(minor_displs.back() + edgelist_edge_counts.back(),
                                              handle.get_stream());
   vertex_t minor_offset{0};
-  for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) {
+  for (size_t i = 0; i < edgelist_minors.size(); ++i) {
     thrust::copy(handle.get_thrust_policy(),
-                 edgelist_minor_vertices[i],
-                 edgelist_minor_vertices[i] + edgelist_edge_counts[i],
+                 edgelist_minors[i],
+                 edgelist_minors[i] + edgelist_edge_counts[i],
                  minor_labels.begin() + minor_offset);
     thrust::sort(handle.get_thrust_policy(),
                  minor_labels.begin() + minor_offset,
                  minor_labels.begin() + minor_offset + edgelist_edge_counts[i]);
-    minor_offset +=
-      thrust::distance(minor_labels.begin() + minor_offset,
-                       thrust::unique(handle.get_thrust_policy(),
-                                      minor_labels.begin() + minor_offset,
-                                      minor_labels.begin() + minor_offset +
-                                        edgelist_edge_counts[i]));
+    minor_offset += thrust::distance(
+      minor_labels.begin() + minor_offset,
+      thrust::unique(handle.get_thrust_policy(),
+                     minor_labels.begin() + minor_offset,
+                     minor_labels.begin() + minor_offset + edgelist_edge_counts[i]));
   }
   minor_labels.resize(minor_offset, handle.get_stream());
   thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end());
@@ -287,27 +286,28 @@ compute_renumber_map(raft::handle_t const& handle,
 
   auto num_non_isolated_vertices = static_cast<vertex_t>(labels.size());
 
-  // 4. if vertex_span.has_value() == true, append isolated vertices
+  // 4. if local_vertices.has_value() == true, append isolated vertices
 
-  if (vertex_span) {
+  if (local_vertices) {
     rmm::device_uvector<vertex_t> isolated_vertices(0, handle.get_stream());
 
-    auto [vertices, num_vertices] = *vertex_span;
-    auto num_isolated_vertices    = thrust::count_if(
+    auto num_isolated_vertices = thrust::count_if(
       handle.get_thrust_policy(),
-      vertices,
-      vertices + num_vertices,
+      (*local_vertices).begin(),
+      (*local_vertices).end(),
       [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) {
         return !thrust::binary_search(thrust::seq, label_first, label_last, v);
       });
     isolated_vertices.resize(num_isolated_vertices, handle.get_stream());
     thrust::copy_if(handle.get_thrust_policy(),
-                    vertices,
-                    vertices + num_vertices,
+                    (*local_vertices).begin(),
+                    (*local_vertices).end(),
                     isolated_vertices.begin(),
                     [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) {
                       return !thrust::binary_search(thrust::seq, label_first, label_last, v);
                     });
+    (*local_vertices).resize(0, handle.get_stream());
+    (*local_vertices).shrink_to_fit(handle.get_stream());
 
     if (isolated_vertices.size() > 0) {
       labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream());
@@ -389,29 +389,32 @@ compute_renumber_map(raft::handle_t const& handle,
 template <typename vertex_t, typename edge_t, bool multi_gpu>
 void expensive_check_edgelist(
   raft::handle_t const& handle,
-  std::optional<std::tuple<vertex_t const*, vertex_t>> vertex_span,
-  std::vector<vertex_t const*> const& edgelist_major_vertices,
-  std::vector<vertex_t const*> const& edgelist_minor_vertices,
+  std::optional<rmm::device_uvector<vertex_t>> const& local_vertices,
+  std::vector<vertex_t const*> const& edgelist_majors,
+  std::vector<vertex_t const*> const& edgelist_minors,
   std::vector<edge_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<edge_t>>> const& edgelist_intra_partition_segment_offsets)
 {
-  rmm::device_uvector<vertex_t> sorted_local_vertices(size_t{0}, handle.get_stream());
-  if (vertex_span) {
-    auto [vertices, num_vertices] = *vertex_span;
-    sorted_local_vertices.resize(num_vertices, handle.get_stream());
-    thrust::copy(
-      handle.get_thrust_policy(), vertices, vertices + num_vertices, sorted_local_vertices.begin());
+  std::optional<rmm::device_uvector<vertex_t>> sorted_local_vertices{std::nullopt};
+  if (local_vertices) {
+    sorted_local_vertices =
+      rmm::device_uvector<vertex_t>((*local_vertices).size(), handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(),
+                 (*local_vertices).begin(),
+                 (*local_vertices).end(),
+                 (*sorted_local_vertices).begin());
     thrust::sort(
-      handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end());
-    CUGRAPH_EXPECTS(static_cast<size_t>(thrust::distance(
-                      sorted_local_vertices.begin(),
-                      thrust::unique(handle.get_thrust_policy(),
-                                     sorted_local_vertices.begin(),
-                                     sorted_local_vertices.end()))) == sorted_local_vertices.size(),
-                    "Invalid input argument: local_vertices should not have duplicates.");
+      handle.get_thrust_policy(), (*sorted_local_vertices).begin(), (*sorted_local_vertices).end());
+    CUGRAPH_EXPECTS(
+      static_cast<size_t>(thrust::distance((*sorted_local_vertices).begin(),
+                                           thrust::unique(handle.get_thrust_policy(),
+                                                          (*sorted_local_vertices).begin(),
+                                                          (*sorted_local_vertices).end()))) ==
+        (*sorted_local_vertices).size(),
+      "Invalid input argument: (local_)vertices should not have duplicates.");
   }
 
-  if (multi_gpu) {
+  if constexpr (multi_gpu) {
     auto& comm               = handle.get_comms();
     auto const comm_size     = comm.get_size();
     auto const comm_rank     = comm.get_rank();
@@ -422,27 +425,28 @@ void expensive_check_edgelist(
     auto const col_comm_size = col_comm.get_size();
     auto const col_comm_rank = col_comm.get_rank();
 
-    CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) &&
-                      (edgelist_major_vertices.size() == static_cast<size_t>(col_comm_size)),
-                    "Invalid input argument: both edgelist_major_vertices.size() & "
-                    "edgelist_minor_vertices.size() should coincide with col_comm_size.");
+    CUGRAPH_EXPECTS((edgelist_majors.size() == edgelist_minors.size()) &&
+                      (edgelist_majors.size() == static_cast<size_t>(col_comm_size)),
+                    "Invalid input argument: both edgelist_majors.size() & "
+                    "edgelist_minors.size() should coincide with col_comm_size.");
 
-    auto [local_vertices, num_local_vertices] = *vertex_span;
-    CUGRAPH_EXPECTS(
-      thrust::count_if(
-        handle.get_thrust_policy(),
-        local_vertices,
-        local_vertices + num_local_vertices,
-        [comm_rank,
-         key_func =
-           detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(auto val) {
-          return key_func(val) != comm_rank;
-        }) == 0,
-      "Invalid input argument: local_vertices should be pre-shuffled.");
-
-    for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
-      auto edge_first = thrust::make_zip_iterator(
-        thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i]));
+    if (sorted_local_vertices) {
+      CUGRAPH_EXPECTS(
+        thrust::count_if(
+          handle.get_thrust_policy(),
+          (*sorted_local_vertices).begin(),
+          (*sorted_local_vertices).end(),
+          [comm_rank,
+           key_func =
+             detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(auto val) {
+            return key_func(val) != comm_rank;
+          }) == 0,
+        "Invalid input argument: local_vertices should be pre-shuffled.");
+    }
+
+    for (size_t i = 0; i < edgelist_majors.size(); ++i) {
+      auto edge_first =
+        thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i]));
       CUGRAPH_EXPECTS(
         thrust::count_if(
           handle.get_thrust_policy(),
@@ -463,10 +467,10 @@ void expensive_check_edgelist(
                    (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) !=
                     row_comm_rank * col_comm_size + col_comm_rank + i * comm_size);
           }) == 0,
-        "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be "
+        "Invalid input argument: edgelist_majors & edgelist_minors should be "
         "pre-shuffled.");
 
-      if (vertex_span) {
+      if (sorted_local_vertices) {
         auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
         auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
 
@@ -483,22 +487,20 @@ void expensive_check_edgelist(
         comm.barrier();  // currently, this is ncclAllReduce
 #endif
 
-        rmm::device_uvector<vertex_t> sorted_major_vertices(0, handle.get_stream());
+        rmm::device_uvector<vertex_t> sorted_majors(0, handle.get_stream());
         {
           auto recvcounts =
-            host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream());
+            host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream());
           std::vector<size_t> displacements(recvcounts.size(), size_t{0});
           std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
-          sorted_major_vertices.resize(displacements.back() + recvcounts.back(),
-                                       handle.get_stream());
+          sorted_majors.resize(displacements.back() + recvcounts.back(), handle.get_stream());
           device_allgatherv(col_comm,
-                            sorted_local_vertices.data(),
-                            sorted_major_vertices.data(),
+                            (*sorted_local_vertices).data(),
+                            sorted_majors.data(),
                             recvcounts,
                             displacements,
                             handle.get_stream());
-          thrust::sort(
-            handle.get_thrust_policy(), sorted_major_vertices.begin(), sorted_major_vertices.end());
+          thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end());
         }
 
         // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
@@ -512,22 +514,20 @@ void expensive_check_edgelist(
         comm.barrier();  // currently, this is ncclAllReduce
 #endif
 
-        rmm::device_uvector<vertex_t> sorted_minor_vertices(0, handle.get_stream());
+        rmm::device_uvector<vertex_t> sorted_minors(0, handle.get_stream());
         {
           auto recvcounts =
-            host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream());
+            host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream());
           std::vector<size_t> displacements(recvcounts.size(), size_t{0});
           std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
-          sorted_minor_vertices.resize(displacements.back() + recvcounts.back(),
-                                       handle.get_stream());
+          sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream());
           device_allgatherv(row_comm,
-                            sorted_local_vertices.data(),
-                            sorted_minor_vertices.data(),
+                            (*sorted_local_vertices).data(),
+                            sorted_minors.data(),
                             recvcounts,
                             displacements,
                             handle.get_stream());
-          thrust::sort(
-            handle.get_thrust_policy(), sorted_minor_vertices.begin(), sorted_minor_vertices.end());
+          thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end());
         }
 
         // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
@@ -541,27 +541,23 @@ void expensive_check_edgelist(
         comm.barrier();  // currently, this is ncclAllReduce
 #endif
 
-        auto edge_first = thrust::make_zip_iterator(
-          thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i]));
+        auto edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i]));
         CUGRAPH_EXPECTS(
           thrust::count_if(
             handle.get_thrust_policy(),
             edge_first,
             edge_first + edgelist_edge_counts[i],
-            [num_major_vertices    = static_cast<vertex_t>(sorted_major_vertices.size()),
-             sorted_major_vertices = sorted_major_vertices.data(),
-             num_minor_vertices    = static_cast<vertex_t>(sorted_minor_vertices.size()),
-             sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) {
-              return !thrust::binary_search(thrust::seq,
-                                            sorted_major_vertices,
-                                            sorted_major_vertices + num_major_vertices,
-                                            thrust::get<0>(e)) ||
-                     !thrust::binary_search(thrust::seq,
-                                            sorted_minor_vertices,
-                                            sorted_minor_vertices + num_minor_vertices,
-                                            thrust::get<1>(e));
+            [num_majors    = static_cast<vertex_t>(sorted_majors.size()),
+             sorted_majors = sorted_majors.data(),
+             num_minors    = static_cast<vertex_t>(sorted_minors.size()),
+             sorted_minors = sorted_minors.data()] __device__(auto e) {
+              return !thrust::binary_search(
+                       thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) ||
+                     !thrust::binary_search(
+                       thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e));
             }) == 0,
-          "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have "
+          "Invalid input argument: edgelist_majors and/or edgelist_minors have "
           "invalid vertex ID(s).");
       }
 
@@ -570,8 +566,8 @@ void expensive_check_edgelist(
           CUGRAPH_EXPECTS(
             thrust::count_if(
               handle.get_thrust_policy(),
-              edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j],
-              edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1],
+              edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j],
+              edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1],
               [row_comm_size,
                col_comm_rank,
                j,
@@ -580,26 +576,26 @@ void expensive_check_edgelist(
                 return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j;
               }) == 0,
             "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is "
-            "true, edgelist_major_vertices & edgelist_minor_vertices should be properly grouped "
+            "true, edgelist_majors & edgelist_minors should be properly grouped "
             "within each local partition.");
         }
       }
     }
   } else {
-    assert(edgelist_major_vertices.size() == 1);
-    assert(edgelist_minor_vertices.size() == 1);
+    assert(edgelist_majors.size() == 1);
+    assert(edgelist_minors.size() == 1);
 
-    if (vertex_span) {
-      auto edge_first = thrust::make_zip_iterator(
-        thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0]));
+    if (sorted_local_vertices) {
+      auto edge_first =
+        thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[0], edgelist_minors[0]));
       CUGRAPH_EXPECTS(
         thrust::count_if(
           handle.get_thrust_policy(),
           edge_first,
           edge_first + edgelist_edge_counts[0],
-          [sorted_local_vertices = sorted_local_vertices.data(),
+          [sorted_local_vertices = (*sorted_local_vertices).data(),
            num_sorted_local_vertices =
-             static_cast<vertex_t>(sorted_local_vertices.size())] __device__(auto e) {
+             static_cast<vertex_t>((*sorted_local_vertices).size())] __device__(auto e) {
             return !thrust::binary_search(thrust::seq,
                                           sorted_local_vertices,
                                           sorted_local_vertices + num_sorted_local_vertices,
@@ -609,7 +605,7 @@ void expensive_check_edgelist(
                                           sorted_local_vertices + num_sorted_local_vertices,
                                           thrust::get<1>(e));
           }) == 0,
-        "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have "
+        "Invalid input argument: edgelist_majors and/or edgelist_minors have "
         "invalid vertex ID(s).");
     }
 
@@ -628,9 +624,9 @@ std::enable_if_t<
   std::tuple<rmm::device_uvector<vertex_t>, renumber_meta_t<vertex_t, edge_t, multi_gpu>>>
 renumber_edgelist(
   raft::handle_t const& handle,
-  std::optional<std::tuple<vertex_t const*, vertex_t>> local_vertex_span,
-  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
-  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<vertex_t>>&& local_vertices,
+  std::vector<vertex_t*> const& edgelist_majors /* [INOUT] */,
+  std::vector<vertex_t*> const& edgelist_minors /* [INOUT] */,
   std::vector<edge_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<edge_t>>> const& edgelist_intra_partition_segment_offsets,
   bool do_expensive_check)
@@ -645,17 +641,17 @@ renumber_edgelist(
   auto const col_comm_size = col_comm.get_size();
   auto const col_comm_rank = col_comm.get_rank();
 
-  CUGRAPH_EXPECTS(edgelist_major_vertices.size() == static_cast<size_t>(col_comm_size),
-                  "Invalid input arguments: erroneous edgelist_major_vertices.size().");
-  CUGRAPH_EXPECTS(edgelist_minor_vertices.size() == static_cast<size_t>(col_comm_size),
-                  "Invalid input arguments: erroneous edgelist_minor_vertices.size().");
+  CUGRAPH_EXPECTS(edgelist_majors.size() == static_cast<size_t>(col_comm_size),
+                  "Invalid input arguments: erroneous edgelist_majors.size().");
+  CUGRAPH_EXPECTS(edgelist_minors.size() == static_cast<size_t>(col_comm_size),
+                  "Invalid input arguments: erroneous edgelist_minors.size().");
   CUGRAPH_EXPECTS(edgelist_edge_counts.size() == static_cast<size_t>(col_comm_size),
                   "Invalid input arguments: erroneous edgelist_edge_counts.size().");
   if (edgelist_intra_partition_segment_offsets) {
     CUGRAPH_EXPECTS(
       (*edgelist_intra_partition_segment_offsets).size() == static_cast<size_t>(col_comm_size),
       "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size().");
-    for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
+    for (size_t i = 0; i < edgelist_majors.size(); ++i) {
       CUGRAPH_EXPECTS(
         (*edgelist_intra_partition_segment_offsets)[i].size() ==
           static_cast<size_t>(row_comm_size + 1),
@@ -673,19 +669,19 @@ renumber_edgelist(
     }
   }
 
-  std::vector<vertex_t const*> edgelist_const_major_vertices(edgelist_major_vertices.size());
-  std::vector<vertex_t const*> edgelist_const_minor_vertices(edgelist_const_major_vertices.size());
-  for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) {
-    edgelist_const_major_vertices[i] = edgelist_major_vertices[i];
-    edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i];
+  std::vector<vertex_t const*> edgelist_const_majors(edgelist_majors.size());
+  std::vector<vertex_t const*> edgelist_const_minors(edgelist_const_majors.size());
+  for (size_t i = 0; i < edgelist_const_majors.size(); ++i) {
+    edgelist_const_majors[i] = edgelist_majors[i];
+    edgelist_const_minors[i] = edgelist_minors[i];
   }
 
   if (do_expensive_check) {
     detail::expensive_check_edgelist<vertex_t, edge_t, multi_gpu>(
       handle,
-      local_vertex_span,
-      edgelist_const_major_vertices,
-      edgelist_const_minor_vertices,
+      local_vertices,
+      edgelist_const_majors,
+      edgelist_const_minors,
       edgelist_edge_counts,
       edgelist_intra_partition_segment_offsets);
   }
@@ -697,9 +693,9 @@ renumber_edgelist(
         num_unique_edge_majors,
         num_unique_edge_minors] =
     detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(handle,
-                                                              local_vertex_span,
-                                                              edgelist_const_major_vertices,
-                                                              edgelist_const_minor_vertices,
+                                                              std::move(local_vertices),
+                                                              edgelist_const_majors,
+                                                              edgelist_const_minors,
                                                               edgelist_edge_counts);
 
   // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened
@@ -742,13 +738,13 @@ renumber_edgelist(
 
   {
     vertex_t max_matrix_partition_major_size{0};
-    for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
+    for (size_t i = 0; i < edgelist_majors.size(); ++i) {
       max_matrix_partition_major_size =
         std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i));
     }
     rmm::device_uvector<vertex_t> renumber_map_major_labels(max_matrix_partition_major_size,
                                                             handle.get_stream());
-    for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
+    for (size_t i = 0; i < edgelist_majors.size(); ++i) {
       device_bcast(col_comm,
                    renumber_map_labels.data(),
                    renumber_map_major_labels.data(),
@@ -777,9 +773,8 @@ renumber_edgelist(
         renumber_map_major_labels.begin(),
         thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i))));
       renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i));
-      renumber_map.find(edgelist_major_vertices[i],
-                        edgelist_major_vertices[i] + edgelist_edge_counts[i],
-                        edgelist_major_vertices[i]);
+      renumber_map.find(
+        edgelist_majors[i], edgelist_majors[i] + edgelist_edge_counts[i], edgelist_majors[i]);
     }
   }
 
@@ -830,11 +825,11 @@ renumber_edgelist(
         thrust::make_counting_iterator(
           partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i))));
       renumber_map.insert(pair_first, pair_first + segment_size);
-      for (size_t j = 0; j < edgelist_minor_vertices.size(); ++j) {
+      for (size_t j = 0; j < edgelist_minors.size(); ++j) {
         renumber_map.find(
-          edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i],
-          edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i + 1],
-          edgelist_minor_vertices[j] + (*edgelist_intra_partition_segment_offsets)[j][i]);
+          edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i],
+          edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i + 1],
+          edgelist_minors[j] + (*edgelist_intra_partition_segment_offsets)[j][i]);
       }
     }
   } else {
@@ -870,10 +865,9 @@ renumber_edgelist(
       renumber_map_minor_labels.begin(),
       thrust::make_counting_iterator(partition.get_matrix_partition_minor_first())));
     renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size());
-    for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) {
-      renumber_map.find(edgelist_minor_vertices[i],
-                        edgelist_minor_vertices[i] + edgelist_edge_counts[i],
-                        edgelist_minor_vertices[i]);
+    for (size_t i = 0; i < edgelist_minors.size(); ++i) {
+      renumber_map.find(
+        edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]);
     }
   }
   // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
@@ -902,18 +896,18 @@ std::enable_if_t<
   !multi_gpu,
   std::tuple<rmm::device_uvector<vertex_t>, renumber_meta_t<vertex_t, edge_t, multi_gpu>>>
 renumber_edgelist(raft::handle_t const& handle,
-                  std::optional<std::tuple<vertex_t const*, vertex_t>> vertex_span,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  std::optional<rmm::device_uvector<vertex_t>>&& vertices,
+                  vertex_t* edgelist_majors /* [INOUT] */,
+                  vertex_t* edgelist_minors /* [INOUT] */,
                   edge_t num_edgelist_edges,
                   bool do_expensive_check)
 {
   if (do_expensive_check) {
     detail::expensive_check_edgelist<vertex_t, edge_t, multi_gpu>(
       handle,
-      vertex_span,
-      std::vector<vertex_t const*>{edgelist_major_vertices},
-      std::vector<vertex_t const*>{edgelist_minor_vertices},
+      vertices,
+      std::vector<vertex_t const*>{edgelist_majors},
+      std::vector<vertex_t const*>{edgelist_minors},
       std::vector<edge_t>{num_edgelist_edges},
       std::nullopt);
   }
@@ -923,9 +917,9 @@ renumber_edgelist(raft::handle_t const& handle,
   std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) =
     detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(
       handle,
-      vertex_span,
-      std::vector<vertex_t const*>{edgelist_major_vertices},
-      std::vector<vertex_t const*>{edgelist_minor_vertices},
+      std::move(vertices),
+      std::vector<vertex_t const*>{edgelist_majors},
+      std::vector<vertex_t const*>{edgelist_minors},
       std::vector<edge_t>{num_edgelist_edges});
 
   double constexpr load_factor = 0.7;
@@ -946,10 +940,8 @@ renumber_edgelist(raft::handle_t const& handle,
   auto pair_first = thrust::make_zip_iterator(
     thrust::make_tuple(renumber_map_labels.begin(), thrust::make_counting_iterator(vertex_t{0})));
   renumber_map.insert(pair_first, pair_first + renumber_map_labels.size());
-  renumber_map.find(
-    edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices);
-  renumber_map.find(
-    edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices);
+  renumber_map.find(edgelist_majors, edgelist_majors + num_edgelist_edges, edgelist_majors);
+  renumber_map.find(edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors);
 
   return std::make_tuple(std::move(renumber_map_labels),
                          renumber_meta_t<vertex_t, edge_t, multi_gpu>{segment_offsets});
diff --git a/cpp/src/structure/renumber_edgelist_mg.cu b/cpp/src/structure/renumber_edgelist_mg.cu
index 4e9f37e10bb..eecad91f5d6 100644
--- a/cpp/src/structure/renumber_edgelist_mg.cu
+++ b/cpp/src/structure/renumber_edgelist_mg.cu
@@ -22,9 +22,9 @@ namespace cugraph {
 template std::tuple<rmm::device_uvector<int32_t>, renumber_meta_t<int32_t, int32_t, true>>
 renumber_edgelist<int32_t, int32_t, true>(
   raft::handle_t const& handle,
-  std::optional<std::tuple<int32_t const*, int32_t>> optional_local_vertex_span,
-  std::vector<int32_t*> const& edgelist_major_vertices /* [INOUT] */,
-  std::vector<int32_t*> const& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<int32_t>>&& local_vertices,
+  std::vector<int32_t*> const& edgelist_majors /* [INOUT] */,
+  std::vector<int32_t*> const& edgelist_minors /* [INOUT] */,
   std::vector<int32_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<int32_t>>> const& edgelist_intra_partition_segment_offsets,
   bool do_expensive_check);
@@ -32,9 +32,9 @@ renumber_edgelist<int32_t, int32_t, true>(
 template std::tuple<rmm::device_uvector<int32_t>, renumber_meta_t<int32_t, int64_t, true>>
 renumber_edgelist<int32_t, int64_t, true>(
   raft::handle_t const& handle,
-  std::optional<std::tuple<int32_t const*, int32_t>> optional_local_vertex_span,
-  std::vector<int32_t*> const& edgelist_major_vertices /* [INOUT] */,
-  std::vector<int32_t*> const& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<int32_t>>&& local_vertices,
+  std::vector<int32_t*> const& edgelist_majors /* [INOUT] */,
+  std::vector<int32_t*> const& edgelist_minors /* [INOUT] */,
   std::vector<int64_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<int64_t>>> const& edgelist_intra_partition_segment_offsets,
   bool do_expensive_check);
@@ -42,9 +42,9 @@ renumber_edgelist<int32_t, int64_t, true>(
 template std::tuple<rmm::device_uvector<int64_t>, renumber_meta_t<int64_t, int64_t, true>>
 renumber_edgelist<int64_t, int64_t, true>(
   raft::handle_t const& handle,
-  std::optional<std::tuple<int64_t const*, int64_t>> optional_local_vertex_span,
-  std::vector<int64_t*> const& edgelist_major_vertices /* [INOUT] */,
-  std::vector<int64_t*> const& edgelist_minor_vertices /* [INOUT] */,
+  std::optional<rmm::device_uvector<int64_t>>&& local_vertices,
+  std::vector<int64_t*> const& edgelist_majors /* [INOUT] */,
+  std::vector<int64_t*> const& edgelist_minors /* [INOUT] */,
   std::vector<int64_t> const& edgelist_edge_counts,
   std::optional<std::vector<std::vector<int64_t>>> const& edgelist_intra_partition_segment_offsets,
   bool do_expensive_check);
diff --git a/cpp/src/structure/renumber_edgelist_sg.cu b/cpp/src/structure/renumber_edgelist_sg.cu
index 3bb25d74b2e..fd591eda052 100644
--- a/cpp/src/structure/renumber_edgelist_sg.cu
+++ b/cpp/src/structure/renumber_edgelist_sg.cu
@@ -20,30 +20,27 @@ namespace cugraph {
 // SG instantiation
 
 template std::tuple<rmm::device_uvector<int32_t>, renumber_meta_t<int32_t, int32_t, false>>
-renumber_edgelist<int32_t, int32_t, false>(
-  raft::handle_t const& handle,
-  std::optional<std::tuple<int32_t const*, int32_t>> optional_vertex_span,
-  int32_t* edgelist_major_vertices /* [INOUT] */,
-  int32_t* edgelist_minor_vertices /* [INOUT] */,
-  int32_t num_edgelist_edges,
-  bool do_expensive_check);
+renumber_edgelist<int32_t, int32_t, false>(raft::handle_t const& handle,
+                                           std::optional<rmm::device_uvector<int32_t>>&& vertices,
+                                           int32_t* edgelist_majors /* [INOUT] */,
+                                           int32_t* edgelist_minors /* [INOUT] */,
+                                           int32_t num_edgelist_edges,
+                                           bool do_expensive_check);
 
 template std::tuple<rmm::device_uvector<int32_t>, renumber_meta_t<int32_t, int64_t, false>>
-renumber_edgelist<int32_t, int64_t, false>(
-  raft::handle_t const& handle,
-  std::optional<std::tuple<int32_t const*, int32_t>> optional_vertex_span,
-  int32_t* edgelist_major_vertices /* [INOUT] */,
-  int32_t* edgelist_minor_vertices /* [INOUT] */,
-  int64_t num_edgelist_edges,
-  bool do_expensive_check);
+renumber_edgelist<int32_t, int64_t, false>(raft::handle_t const& handle,
+                                           std::optional<rmm::device_uvector<int32_t>>&& vertices,
+                                           int32_t* edgelist_majors /* [INOUT] */,
+                                           int32_t* edgelist_minors /* [INOUT] */,
+                                           int64_t num_edgelist_edges,
+                                           bool do_expensive_check);
 
 template std::tuple<rmm::device_uvector<int64_t>, renumber_meta_t<int64_t, int64_t, false>>
-renumber_edgelist<int64_t, int64_t, false>(
-  raft::handle_t const& handle,
-  std::optional<std::tuple<int64_t const*, int64_t>> optional_vertex_span,
-  int64_t* edgelist_major_vertices /* [INOUT] */,
-  int64_t* edgelist_minor_vertices /* [INOUT] */,
-  int64_t num_edgelist_edges,
-  bool do_expensive_check);
+renumber_edgelist<int64_t, int64_t, false>(raft::handle_t const& handle,
+                                           std::optional<rmm::device_uvector<int64_t>>&& vertices,
+                                           int64_t* edgelist_majors /* [INOUT] */,
+                                           int64_t* edgelist_minors /* [INOUT] */,
+                                           int64_t num_edgelist_edges,
+                                           bool do_expensive_check);
 
 }  // namespace cugraph

From da39f24cd509e4e60ad8972ff3dd38cd37cd636d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 29 Oct 2021 15:27:26 -0400
Subject: [PATCH 6/7] clang-format

---
 cpp/tests/utilities/base_fixture.hpp | 56 ++++++++++++++--------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp
index 7b922b87111..9e4ce610569 100644
--- a/cpp/tests/utilities/base_fixture.hpp
+++ b/cpp/tests/utilities/base_fixture.hpp
@@ -185,32 +185,32 @@ inline auto parse_test_options(int argc, char** argv)
     return RUN_ALL_TESTS();                                                     \
   }
 
-#define CUGRAPH_MG_TEST_PROGRAM_MAIN()                                                \
-  int main(int argc, char** argv)                                                     \
-  {                                                                                   \
-    MPI_TRY(MPI_Init(&argc, &argv));                                                  \
-    int comm_rank{};                                                                  \
-    int comm_size{};                                                                  \
-    MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank));                               \
-    MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size));                               \
-    int num_gpus_per_node{};                                                          \
-    CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));                                 \
-    CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node));                           \
-    ::testing::InitGoogleTest(&argc, argv);                                           \
-    auto const cmd_opts = parse_test_options(argc, argv);                             \
-    auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();                     \
-    auto resource       = cugraph::test::create_memory_resource(rmm_mode);            \
-    rmm::mr::set_current_device_resource(resource.get());                             \
-    cugraph::test::g_perf = cmd_opts["perf"].as<bool>();                              \
-    cugraph::test::g_rmat_scale =                                                     \
-      (cmd_opts.count("rmat_scale") > 0)                                              \
-        ? std::make_optional<size_t>(cmd_opts["rmat_scale"].as<size_t>())             \
-        : std::nullopt;                                                               \
-    cugraph::test::g_rmat_edge_factor =                                               \
-      (cmd_opts.count("rmat_edge_factor") > 0)                                        \
-        ? std::make_optional<size_t>(cmd_opts["rmat_edge_factor"].as<size_t>())       \
-        : std::nullopt;                                                               \
-    auto ret = RUN_ALL_TESTS();                                                       \
-    MPI_TRY(MPI_Finalize());                                                          \
-    return ret;                                                                       \
+#define CUGRAPH_MG_TEST_PROGRAM_MAIN()                                          \
+  int main(int argc, char** argv)                                               \
+  {                                                                             \
+    MPI_TRY(MPI_Init(&argc, &argv));                                            \
+    int comm_rank{};                                                            \
+    int comm_size{};                                                            \
+    MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank));                         \
+    MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size));                         \
+    int num_gpus_per_node{};                                                    \
+    CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));                           \
+    CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node));                     \
+    ::testing::InitGoogleTest(&argc, argv);                                     \
+    auto const cmd_opts = parse_test_options(argc, argv);                       \
+    auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();               \
+    auto resource       = cugraph::test::create_memory_resource(rmm_mode);      \
+    rmm::mr::set_current_device_resource(resource.get());                       \
+    cugraph::test::g_perf = cmd_opts["perf"].as<bool>();                        \
+    cugraph::test::g_rmat_scale =                                               \
+      (cmd_opts.count("rmat_scale") > 0)                                        \
+        ? std::make_optional<size_t>(cmd_opts["rmat_scale"].as<size_t>())       \
+        : std::nullopt;                                                         \
+    cugraph::test::g_rmat_edge_factor =                                         \
+      (cmd_opts.count("rmat_edge_factor") > 0)                                  \
+        ? std::make_optional<size_t>(cmd_opts["rmat_edge_factor"].as<size_t>()) \
+        : std::nullopt;                                                         \
+    auto ret = RUN_ALL_TESTS();                                                 \
+    MPI_TRY(MPI_Finalize());                                                    \
+    return ret;                                                                 \
   }

From 292eda5183ef50c0d58ffdd32cc6893965dff960 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 29 Oct 2021 13:55:59 -0700
Subject: [PATCH 7/7] remove unncessary host barrier synchronization

---
 .../prims/copy_to_adj_matrix_row_col.cuh      |  89 ---------------
 .../copy_v_transform_reduce_in_out_nbr.cuh    |  45 --------
 ...ransform_reduce_key_aggregated_out_nbr.cuh |  42 -------
 .../update_frontier_v_push_if_out_nbr.cuh     |  81 -------------
 cpp/src/structure/coarsen_graph_impl.cuh      |  21 ----
 .../create_graph_from_edgelist_impl.cuh       |  36 ------
 cpp/src/structure/renumber_edgelist_impl.cuh  | 107 ------------------
 cpp/src/structure/renumber_utils_impl.cuh     |  31 -----
 8 files changed, 452 deletions(-)

diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
index 7100e7c8663..508294c9e89 100644
--- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
+++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
@@ -22,7 +22,6 @@
 #include <cugraph/utilities/dataframe_buffer.cuh>
 #include <cugraph/utilities/device_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 #include <cugraph/utilities/thrust_tuple_utils.cuh>
 #include <cugraph/vertex_partition_device_view.cuh>
@@ -65,17 +64,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
     auto const col_comm_rank = col_comm.get_rank();
     auto const col_comm_size = col_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     if (matrix_major_value_output.key_first()) {
       auto key_offsets = GraphViewType::is_adj_matrix_transposed
                            ? *(graph_view.get_local_sorted_unique_edge_col_offsets())
@@ -122,17 +110,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
                         displacements,
                         handle.get_stream());
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   } else {
     assert(!(matrix_major_value_output.key_first()));
     assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed
@@ -170,17 +147,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
     auto const col_comm_rank = col_comm.get_rank();
     auto const col_comm_size = col_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     auto rx_counts =
       host_scalar_allgather(col_comm,
                             static_cast<size_t>(thrust::distance(vertex_first, vertex_last)),
@@ -260,17 +226,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
           matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset());
       }
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   } else {
     assert(!(matrix_major_value_output.key_first()));
     assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed
@@ -305,17 +260,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
     auto const col_comm_rank = col_comm.get_rank();
     auto const col_comm_size = col_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     if (matrix_minor_value_output.key_first()) {
       auto key_offsets = GraphViewType::is_adj_matrix_transposed
                            ? *(graph_view.get_local_sorted_unique_edge_row_offsets())
@@ -362,17 +306,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
                         displacements,
                         handle.get_stream());
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   } else {
     assert(!(matrix_minor_value_output.key_first()));
     assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed
@@ -410,17 +343,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
     auto const col_comm_rank = col_comm.get_rank();
     auto const col_comm_size = col_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     auto rx_counts =
       host_scalar_allgather(row_comm,
                             static_cast<size_t>(thrust::distance(vertex_first, vertex_last)),
@@ -498,17 +420,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
                         matrix_minor_value_output.value_data());
       }
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   } else {
     assert(!(matrix_minor_value_output.key_first()));
     assert(graph_view.get_number_of_local_vertices() ==
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
index 9fb7e8bf2a1..0cf6633d23e 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
@@ -22,7 +22,6 @@
 #include <cugraph/utilities/dataframe_buffer.cuh>
 #include <cugraph/utilities/device_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
@@ -603,17 +602,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       auto const col_comm_rank = col_comm.get_rank();
       auto const col_comm_size = col_comm.get_size();
 
-      // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-      // two different communicators (beginning of col_comm)
-#if 1
-      // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-      // and MPI barrier with MPI)
-      host_barrier(comm, handle.get_stream_view());
-#else
-      handle.get_stream_view().synchronize();
-      comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
       device_reduce(col_comm,
                     major_buffer_first,
                     vertex_value_output_first,
@@ -621,17 +609,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
                     raft::comms::op_t::SUM,
                     i,
                     handle.get_stream());
-
-      // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-      // two different communicators (end of col_comm)
-#if 1
-      // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-      // and MPI barrier with MPI)
-      host_barrier(comm, handle.get_stream_view());
-#else
-      handle.get_stream_view().synchronize();
-      comm.barrier();  // currently, this is ncclAllReduce
-#endif
     }
   }
 
@@ -645,17 +622,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     auto const col_comm_rank = col_comm.get_rank();
     auto const col_comm_size = col_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     for (int i = 0; i < row_comm_size; ++i) {
       auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) -
                      graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size));
@@ -668,17 +634,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
                     i,
                     handle.get_stream());
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   }
 }
 
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index aac89919892..9aca0361ccf 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -22,7 +22,6 @@
 #include <cugraph/utilities/collect_comm.cuh>
 #include <cugraph/utilities/dataframe_buffer.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 #include <cugraph/vertex_partition_device_view.cuh>
@@ -242,17 +241,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
     auto const row_comm_rank = row_comm.get_rank();
     auto const row_comm_size = row_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     auto map_counts = host_scalar_allgather(
       row_comm,
       static_cast<size_t>(thrust::distance(map_unique_key_first, map_unique_key_last)),
@@ -320,21 +308,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
 
   // 2. aggregate each vertex out-going edges based on keys and transform-reduce.
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   rmm::device_uvector<vertex_t> major_vertices(0, handle.get_stream());
   auto e_op_result_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
@@ -546,21 +519,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
     }
   }
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   auto execution_policy = handle.get_thrust_policy();
   thrust::fill(execution_policy,
                vertex_value_output_first,
diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh
index c84fcd19ce4..29a3c1cca1e 100644
--- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh
@@ -23,7 +23,6 @@
 #include <cugraph/utilities/dataframe_buffer.cuh>
 #include <cugraph/utilities/device_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 #include <cugraph/utilities/thrust_tuple_utils.cuh>
@@ -609,20 +608,6 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier(
 
   edge_t ret{0};
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   auto const& cur_frontier_bucket = frontier.get_bucket(cur_frontier_bucket_idx);
   vertex_t const* local_frontier_vertex_first{nullptr};
   vertex_t const* local_frontier_vertex_last{nullptr};
@@ -719,20 +704,6 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier(
     }
   }
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   return ret;
 }
 
@@ -840,21 +811,6 @@ void update_frontier_v_push_if_out_nbr(
 
   // 1. fill the buffer
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   auto key_buffer = allocate_dataframe_buffer<key_t>(size_t{0}, handle.get_stream());
   auto payload_buffer =
     detail::allocate_optional_payload_buffer<payload_t>(size_t{0}, handle.get_stream());
@@ -1097,21 +1053,6 @@ void update_frontier_v_push_if_out_nbr(
     }
   }
 
-  if (GraphViewType::is_multi_gpu) {
-    auto& comm = handle.get_comms();
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   // 2. reduce the buffer
 
   auto num_buffer_elements = detail::sort_and_reduce_buffer_elements(
@@ -1128,17 +1069,6 @@ void update_frontier_v_push_if_out_nbr(
     auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
     auto const col_comm_rank = col_comm.get_rank();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     std::vector<vertex_t> h_vertex_lasts(row_comm_size);
     for (size_t i = 0; i < h_vertex_lasts.size(); ++i) {
       h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i);
@@ -1189,17 +1119,6 @@ void update_frontier_v_push_if_out_nbr(
       detail::get_optional_payload_buffer_begin<payload_t>(payload_buffer),
       size_dataframe_buffer(key_buffer),
       reduce_op);
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   }
 
   // 3. update vertex properties and frontier
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index f49c7be626d..44f87c71085 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -24,7 +24,6 @@
 #include <cugraph/prims/copy_to_adj_matrix_row_col.cuh>
 #include <cugraph/prims/row_col_properties.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
@@ -238,16 +237,6 @@ coarsen_graph(
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
     // 1-1. locally construct coarsened edge list
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
     rmm::device_uvector<vertex_t> major_labels(
       store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i)
                        : graph_view.get_number_of_local_adj_matrix_partition_rows(i),
@@ -258,16 +247,6 @@ coarsen_graph(
                  major_labels.size(),
                  static_cast<int>(i),
                  handle.get_stream());
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
 
     auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] =
       decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist(
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index 4516b087180..849705df4ac 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -22,7 +22,6 @@
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/device_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 
 #include <raft/handle.hpp>
@@ -113,19 +112,6 @@ void expensive_check_edgelist(raft::handle_t const& handle,
       "Invalid input argument: edgelist_majors & edgelist_minors should be pre-shuffled.");
 
     if (vertices) {
-      // FIXME: this barrier is unnecessary if the above host_scalar_allreduce is a true host
-      // operation (as it serves as a barrier) barrier is necessary here to avoid potential
-      // overlap (which can leads to deadlock) between two different communicators (beginning of
-      // col_comm)
-#if 1
-      // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-      // DASK and MPI barrier with MPI)
-      host_barrier(comm, handle.get_stream_view());
-#else
-      handle.get_stream_view().synchronize();
-      comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
       rmm::device_uvector<vertex_t> sorted_majors(0, handle.get_stream());
       {
         auto recvcounts = host_scalar_allgather(col_comm, (*vertices).size(), handle.get_stream());
@@ -141,17 +127,6 @@ void expensive_check_edgelist(raft::handle_t const& handle,
         thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end());
       }
 
-      // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
-      // between two different communicators (beginning of row_comm)
-#if 1
-      // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-      // DASK and MPI barrier with MPI)
-      host_barrier(comm, handle.get_stream_view());
-#else
-      handle.get_stream_view().synchronize();
-      comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
       rmm::device_uvector<vertex_t> sorted_minors(0, handle.get_stream());
       {
         auto recvcounts = host_scalar_allgather(row_comm, (*vertices).size(), handle.get_stream());
@@ -167,17 +142,6 @@ void expensive_check_edgelist(raft::handle_t const& handle,
         thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end());
       }
 
-      // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
-      // between two different communicators (end of row_comm)
-#if 1
-      // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-      // DASK and MPI barrier with MPI)
-      host_barrier(comm, handle.get_stream_view());
-#else
-      handle.get_stream_view().synchronize();
-      comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
       auto edge_first = thrust::make_zip_iterator(
         thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin()));
       CUGRAPH_EXPECTS(
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index 2c22630ea3f..8d4eb0abf10 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -20,7 +20,6 @@
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/device_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
@@ -59,22 +58,6 @@ compute_renumber_map(raft::handle_t const& handle,
 
   // 1. acquire (unique major label, count) pairs
 
-  if (multi_gpu) {
-    auto& comm = handle.get_comms();
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of col_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    ;
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-  }
-
   rmm::device_uvector<vertex_t> major_labels(0, handle.get_stream());
   rmm::device_uvector<edge_t> major_counts(0, handle.get_stream());
   vertex_t num_local_unique_edge_majors{0};
@@ -202,17 +185,6 @@ compute_renumber_map(raft::handle_t const& handle,
     auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
     auto const row_comm_size = row_comm.get_size();
 
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (beginning of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
     if (row_comm_size > 1) {
       rmm::device_uvector<vertex_t> rx_minor_labels(0, handle.get_stream());
       std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values(
@@ -230,18 +202,6 @@ compute_renumber_map(raft::handle_t const& handle,
                              handle.get_stream());
       minor_labels = std::move(rx_minor_labels);
     }
-
-    // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between
-    // two different communicators (end of row_comm)
-#if 1
-    // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK
-    // and MPI barrier with MPI)
-    //
-    host_barrier(comm, handle.get_stream_view());
-#else
-    handle.get_stream_view().synchronize();
-    comm.barrier();  // currently, this is ncclAllReduce
-#endif
   }
   minor_labels.shrink_to_fit(handle.get_stream_view());
 
@@ -474,19 +434,6 @@ void expensive_check_edgelist(
         auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
         auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
 
-        // FIXME: this barrier is unnecessary if the above host_scalar_allreduce is a true host
-        // operation (as it serves as a barrier) barrier is necessary here to avoid potential
-        // overlap (which can leads to deadlock) between two different communicators (beginning of
-        // col_comm)
-#if 1
-        // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-        // DASK and MPI barrier with MPI)
-        host_barrier(comm, handle.get_stream_view());
-#else
-        handle.get_stream_view().synchronize();
-        comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
         rmm::device_uvector<vertex_t> sorted_majors(0, handle.get_stream());
         {
           auto recvcounts =
@@ -503,17 +450,6 @@ void expensive_check_edgelist(
           thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end());
         }
 
-        // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
-        // between two different communicators (beginning of row_comm)
-#if 1
-        // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-        // DASK and MPI barrier with MPI)
-        host_barrier(comm, handle.get_stream_view());
-#else
-        handle.get_stream_view().synchronize();
-        comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
         rmm::device_uvector<vertex_t> sorted_minors(0, handle.get_stream());
         {
           auto recvcounts =
@@ -530,17 +466,6 @@ void expensive_check_edgelist(
           thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end());
         }
 
-        // barrier is necessary here to avoid potential overlap (which can leads to deadlock)
-        // between two different communicators (end of row_comm)
-#if 1
-        // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with
-        // DASK and MPI barrier with MPI)
-        host_barrier(comm, handle.get_stream_view());
-#else
-        handle.get_stream_view().synchronize();
-        comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
         auto edge_first =
           thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i]));
         CUGRAPH_EXPECTS(
@@ -724,18 +649,6 @@ renumber_edgelist(
   // FIXME: compare this hash based approach with a binary search based approach in both memory
   // footprint and execution time
 
-  // FIXME: this barrier is unnecessary if the above host_scalar_allgather is a true host operation
-  // (as it serves as a barrier) barrier is necessary here to avoid potential overlap (which can
-  // leads to deadlock) between two different communicators (beginning of col_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
-
   {
     vertex_t max_matrix_partition_major_size{0};
     for (size_t i = 0; i < edgelist_majors.size(); ++i) {
@@ -778,16 +691,6 @@ renumber_edgelist(
     }
   }
 
-  // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
-  // different communicators (beginning of row_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
   if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) &&
       edgelist_intra_partition_segment_offsets) {  // memory footprint dominated by the O(V/sqrt(P))
                                                    // part than the O(E/P) part
@@ -870,16 +773,6 @@ renumber_edgelist(
         edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]);
     }
   }
-  // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
-  // different communicators (end of row_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
 
   return std::make_tuple(
     std::move(renumber_map_labels),
diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh
index b719fe6bc3c..871d71975df 100644
--- a/cpp/src/structure/renumber_utils_impl.cuh
+++ b/cpp/src/structure/renumber_utils_impl.cuh
@@ -20,7 +20,6 @@
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/collect_comm.cuh>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/host_barrier.hpp>
 #include <cugraph/utilities/host_scalar_comm.cuh>
 
 #include <cuco/static_map.cuh>
@@ -163,16 +162,6 @@ void unrenumber_local_int_edges(
   // FIXME: compare this hash based approach with a binary search based approach in both memory
   // footprint and execution time
 
-  // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
-  // different communicators (beginning of col_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
   {
     vertex_t max_matrix_partition_major_size{0};
     for (size_t i = 0; i < edgelist_majors.size(); ++i) {
@@ -224,16 +213,6 @@ void unrenumber_local_int_edges(
     }
   }
 
-  // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
-  // different communicators (beginning of row_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
   vertex_t matrix_partition_minor_size{0};
   for (int i = 0; i < row_comm_size; ++i) {
     auto vertex_partition_rank = col_comm_rank * row_comm_size + i;
@@ -340,16 +319,6 @@ void unrenumber_local_int_edges(
         edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], edgelist_minors[i]);
     }
   }
-  // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two
-  // different communicators (end of row_comm)
-#if 1
-  // FIXME: temporary hack till UCC is integrated into RAFT (so we can use UCC barrier with DASK and
-  // MPI barrier with MPI)
-  host_barrier(comm, handle.get_stream_view());
-#else
-  handle.get_stream_view().synchronize();
-  comm.barrier();  // currently, this is ncclAllReduce
-#endif
 }
 
 }  // namespace detail