diff --git a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh index e5b64b1e02f..c7521831fb8 100644 --- a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -127,7 +127,7 @@ class edge_partition_edge_property_device_view_t { value_t> atomic_add(edge_t offset, value_t val) const { - cugraph::atomic_add(value_first_ + offset, val); + return cugraph::atomic_add(value_first_ + offset, val); } template @@ -154,7 +154,7 @@ class edge_partition_edge_property_device_view_t { value_t> elementwise_atomic_min(edge_t offset, value_t val) const { - cugraph::elementwise_atomic_min(value_first_ + offset, val); + return cugraph::elementwise_atomic_min(value_first_ + offset, val); } template @@ -164,7 +164,7 @@ class edge_partition_edge_property_device_view_t { value_t> elementwise_atomic_max(edge_t offset, value_t val) const { - cugraph::elementwise_atomic_max(value_first_ + offset, val); + return cugraph::elementwise_atomic_max(value_first_ + offset, val); } private: diff --git a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh index 7578c646175..824662a957e 100644 --- a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -133,7 +133,7 @@ class edge_partition_endpoint_property_device_view_t { atomic_add(vertex_t offset, value_t val) const { auto val_offset = value_offset(offset); - cugraph::atomic_add(value_first_ + val_offset, val); + return cugraph::atomic_add(value_first_ + val_offset, val); } template @@ -162,7 +162,7 @@ class edge_partition_endpoint_property_device_view_t { elementwise_atomic_min(vertex_t offset, value_t val) const { auto val_offset = value_offset(offset); - cugraph::elementwise_atomic_min(value_first_ + val_offset, val); + return cugraph::elementwise_atomic_min(value_first_ + val_offset, val); } template @@ -173,7 +173,7 @@ class edge_partition_endpoint_property_device_view_t { elementwise_atomic_max(vertex_t offset, value_t val) const { auto val_offset = value_offset(offset); - cugraph::elementwise_atomic_max(value_first_ + val_offset, val); + return cugraph::elementwise_atomic_max(value_first_ + val_offset, val); } private: diff --git a/cpp/include/cugraph/utilities/atomic_ops.cuh b/cpp/include/cugraph/utilities/atomic_ops.cuh index 6af9841d71f..2b6102a6a17 100644 --- a/cpp/include/cugraph/utilities/atomic_ops.cuh +++ b/cpp/include/cugraph/utilities/atomic_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -112,7 +113,7 @@ __device__ T> atomic_and(Iterator iter, T value) { - detail::thrust_tuple_atomic_and( + return detail::thrust_tuple_atomic_and( iter, value, std::make_index_sequence::value>{}); } @@ -140,7 +141,7 @@ __device__ T> atomic_or(Iterator iter, T value) { - detail::thrust_tuple_atomic_or( + return detail::thrust_tuple_atomic_or( iter, value, std::make_index_sequence::value>{}); } @@ -155,22 +156,22 @@ template __device__ std::enable_if_t && std::is_same_v::value_type, T>, - void> + T> atomic_add(Iterator iter, T value) { - atomicAdd(&(thrust::raw_reference_cast(*iter)), value); + return atomicAdd(&(thrust::raw_reference_cast(*iter)), value); } template __device__ std::enable_if_t::value_type>::value && is_thrust_tuple::value, - void> + T> atomic_add(Iterator iter, T value) { static_assert(thrust::tuple_size::value_type>::value == thrust::tuple_size::value); - detail::thrust_tuple_atomic_add( + return detail::thrust_tuple_atomic_add( iter, value, std::make_index_sequence::value>{}); } @@ -191,7 +192,7 @@ __device__ T> elementwise_atomic_cas(Iterator iter, T compare, T value) { - detail::thrust_tuple_elementwise_atomic_cas( + return detail::thrust_tuple_elementwise_atomic_cas( iter, compare, value, std::make_index_sequence::value>{}); } @@ -206,22 +207,22 @@ template __device__ std::enable_if_t::value_type, T>::value && std::is_arithmetic::value, - void> + T> elementwise_atomic_min(Iterator iter, T const& value) { - atomicMin(&(thrust::raw_reference_cast(*iter)), value); + return atomicMin(&(thrust::raw_reference_cast(*iter)), value); } template __device__ std::enable_if_t::value_type>::value && is_thrust_tuple::value, - void> + T> elementwise_atomic_min(Iterator iter, T const& value) { static_assert(thrust::tuple_size::value_type>::value == thrust::tuple_size::value); - detail::thrust_tuple_elementwise_atomic_min( + return detail::thrust_tuple_elementwise_atomic_min( iter, value, 
std::make_index_sequence::value>{}); } @@ -236,23 +237,35 @@ template __device__ std::enable_if_t::value_type, T>::value && std::is_arithmetic::value, - void> + T> elementwise_atomic_max(Iterator iter, T const& value) { - atomicMax(&(thrust::raw_reference_cast(*iter)), value); + return atomicMax(&(thrust::raw_reference_cast(*iter)), value); } template __device__ std::enable_if_t::value_type>::value && is_thrust_tuple::value, - void> + T> elementwise_atomic_max(Iterator iter, T const& value) { static_assert(thrust::tuple_size::value_type>::value == thrust::tuple_size::value); - detail::thrust_tuple_elementwise_atomic_max( + return detail::thrust_tuple_elementwise_atomic_max( iter, value, std::make_index_sequence::value>{}); } +template +__device__ void packed_bool_atomic_set(Iterator iter, T offset, bool val) +{ + auto packed_output_offset = packed_bool_offset(offset); + auto packed_output_mask = packed_bool_mask(offset); + if (val) { + atomicOr(iter + packed_output_offset, packed_output_mask); + } else { + atomicAnd(iter + packed_output_offset, ~packed_output_mask); + } +} + } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index ab4c4eff6b5..a20613c65ef 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -40,30 +40,30 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, buffer_size, stream_view)...); } -template -auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence, TupleType& buffer) +template +auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence, TupleType& buffer) { - return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).begin())...)); + return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).begin())...)); } -template -auto get_dataframe_buffer_end_tuple_impl(std::index_sequence, TupleType& buffer) +template +auto 
get_dataframe_buffer_end_tuple_impl(std::index_sequence, TupleType& buffer) { - return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).end())...)); + return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).end())...)); } -template -auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence, TupleType& buffer) +template +auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence, TupleType& buffer) { // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. - return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).cbegin())...)); + return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).cbegin())...)); } -template -auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence, TupleType& buffer) +template +auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence, TupleType& buffer) { // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. - return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).cend())...)); + return thrust::make_zip_iterator(thrust::make_tuple((std::get(buffer).cend())...)); } } // namespace detail diff --git a/cpp/src/c_api/graph_helper_impl.cuh b/cpp/src/c_api/graph_helper_impl.cuh index a0614c5e08e..6377fde8fee 100644 --- a/cpp/src/c_api/graph_helper_impl.cuh +++ b/cpp/src/c_api/graph_helper_impl.cuh @@ -37,7 +37,7 @@ edge_property_t create_constant_edge_property(raft::handle_t c { edge_property_t edge_property(handle, graph_view); - cugraph::fill_edge_property(handle, graph_view, constant_value, edge_property); + cugraph::fill_edge_property(handle, graph_view, edge_property.mutable_view(), constant_value); return edge_property; } diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 7b68d88abea..8ae49ed207c 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ 
b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -130,8 +130,8 @@ std::tuple, rmm::device_uvector> brandes_b edge_t hop{0}; while (true) { - update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas); - update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances); + update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); + update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); auto [new_frontier, new_sigma] = transform_reduce_v_frontier_outgoing_e_by_dst(handle, @@ -228,12 +228,12 @@ void accumulate_vertex_results( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - src_properties); + src_properties.mutable_view()); update_edge_dst_property( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - dst_properties); + dst_properties.mutable_view()); // FIXME: To do this efficiently, I need a version of // per_v_transform_reduce_outgoing_e that takes a vertex list @@ -272,12 +272,12 @@ void accumulate_vertex_results( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - src_properties); + src_properties.mutable_view()); update_edge_dst_property( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - dst_properties); + dst_properties.mutable_view()); thrust::transform(handle.get_thrust_policy(), centralities.begin(), @@ -323,12 +323,12 @@ void accumulate_edge_results( 
handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - src_properties); + src_properties.mutable_view()); update_edge_dst_property( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - dst_properties); + dst_properties.mutable_view()); // // For now this will do a O(E) pass over all edges over the diameter @@ -417,12 +417,12 @@ void accumulate_edge_results( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - src_properties); + src_properties.mutable_view()); update_edge_dst_property( handle, graph_view, thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), - dst_properties); + dst_properties.mutable_view()); } } @@ -594,9 +594,11 @@ edge_betweenness_centrality( if (graph_view.has_edge_mask()) { auto unmasked_graph_view = graph_view; unmasked_graph_view.clear_edge_mask(); - fill_edge_property(handle, unmasked_graph_view, weight_t{0}, centralities, do_expensive_check); + fill_edge_property( + handle, unmasked_graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check); } else { - fill_edge_property(handle, graph_view, weight_t{0}, centralities, do_expensive_check); + fill_edge_property( + handle, graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check); } size_t num_sources = thrust::distance(vertices_begin, vertices_end); diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh index 09436b68189..922f5b44f83 100644 --- a/cpp/src/centrality/eigenvector_centrality_impl.cuh +++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh @@ -98,7 +98,7 @@ rmm::device_uvector eigenvector_centrality( old_centralities.data()); update_edge_src_property( - handle, pull_graph_view, old_centralities.begin(), edge_src_centralities); + handle, pull_graph_view, old_centralities.begin(), edge_src_centralities.mutable_view()); if 
(edge_weight_view) { per_v_transform_reduce_incoming_e( diff --git a/cpp/src/centrality/katz_centrality_impl.cuh b/cpp/src/centrality/katz_centrality_impl.cuh index 132d7aa5039..953e0ee381c 100644 --- a/cpp/src/centrality/katz_centrality_impl.cuh +++ b/cpp/src/centrality/katz_centrality_impl.cuh @@ -107,7 +107,7 @@ void katz_centrality( std::swap(new_katz_centralities, old_katz_centralities); update_edge_src_property( - handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities); + handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities.mutable_view()); if (edge_weight_view) { per_v_transform_reduce_incoming_e( diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index e693beee489..a0ccfa52ffc 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -50,9 +50,11 @@ std::tuple, weight_t> approximate_weighted_matchin if (current_graph_view.has_edge_mask()) { current_graph_view.clear_edge_mask(); } cugraph::edge_property_t edge_masks_even(handle, current_graph_view); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_even.mutable_view(), bool{false}); cugraph::edge_property_t edge_masks_odd(handle, current_graph_view); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_odd.mutable_view(), bool{false}); if (graph_view.has_edge_mask()) { current_graph_view.attach_edge_mask(*(graph_view.edge_mask_view())); @@ -99,7 +101,8 @@ std::tuple, weight_t> approximate_weighted_matchin if constexpr (graph_view_t::is_multi_gpu) { src_key_cache = edge_src_property_t(handle, current_graph_view); - update_edge_src_property(handle, current_graph_view, local_vertices.begin(), src_key_cache); + 
update_edge_src_property( + handle, current_graph_view, local_vertices.begin(), src_key_cache.mutable_view()); src_match_flags = cugraph::edge_src_property_t(handle, current_graph_view); dst_match_flags = cugraph::edge_dst_property_t(handle, current_graph_view); @@ -296,9 +299,9 @@ std::tuple, weight_t> approximate_weighted_matchin if constexpr (graph_view_t::is_multi_gpu) { cugraph::update_edge_src_property( - handle, current_graph_view, is_vertex_matched.begin(), src_match_flags); + handle, current_graph_view, is_vertex_matched.begin(), src_match_flags.mutable_view()); cugraph::update_edge_dst_property( - handle, current_graph_view, is_vertex_matched.begin(), dst_match_flags); + handle, current_graph_view, is_vertex_matched.begin(), dst_match_flags.mutable_view()); } if (loop_counter % 2 == 0) { @@ -330,7 +333,8 @@ std::tuple, weight_t> approximate_weighted_matchin } if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_even.mutable_view(), bool{false}); current_graph_view.attach_edge_mask(edge_masks_odd.view()); } else { if constexpr (graph_view_t::is_multi_gpu) { @@ -361,7 +365,8 @@ std::tuple, weight_t> approximate_weighted_matchin } if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_odd.mutable_view(), bool{false}); current_graph_view.attach_edge_mask(edge_masks_even.view()); } diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index dcad4e92b95..e17abdb3703 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -299,7 +299,7 @@ rmm::device_uvector update_clustering_by_delta_modularity( 
edge_src_property_t, weight_t>(handle, graph_view); update_edge_src_property( - handle, graph_view, vertex_cluster_weights_v.begin(), src_cluster_weights); + handle, graph_view, vertex_cluster_weights_v.begin(), src_cluster_weights.mutable_view()); vertex_cluster_weights_v.resize(0, handle.get_stream()); vertex_cluster_weights_v.shrink_to_fit(handle.get_stream()); } else { @@ -367,7 +367,7 @@ rmm::device_uvector update_clustering_by_delta_modularity( graph_view, thrust::make_zip_iterator(thrust::make_tuple( old_cluster_sum_v.begin(), cluster_subtract_v.begin())), - src_old_cluster_sum_subtract_pairs); + src_old_cluster_sum_subtract_pairs.mutable_view()); old_cluster_sum_v.resize(0, handle.get_stream()); old_cluster_sum_v.shrink_to_fit(handle.get_stream()); cluster_subtract_v.resize(0, handle.get_stream()); diff --git a/cpp/src/community/detail/maximal_independent_moves.cuh b/cpp/src/community/detail/maximal_independent_moves.cuh index 82d20a04203..85892f711ba 100644 --- a/cpp/src/community/detail/maximal_independent_moves.cuh +++ b/cpp/src/community/detail/maximal_independent_moves.cuh @@ -163,8 +163,10 @@ rmm::device_uvector maximal_independent_moves( if constexpr (multi_gpu) { src_rank_cache = edge_src_property_t(handle, graph_view); dst_rank_cache = edge_dst_property_t(handle, graph_view); - update_edge_src_property(handle, graph_view, temporary_ranks.begin(), src_rank_cache); - update_edge_dst_property(handle, graph_view, temporary_ranks.begin(), dst_rank_cache); + update_edge_src_property( + handle, graph_view, temporary_ranks.begin(), src_rank_cache.mutable_view()); + update_edge_dst_property( + handle, graph_view, temporary_ranks.begin(), dst_rank_cache.mutable_view()); } // diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index ef34ad90584..99fc1cd6fae 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -262,12 +262,16 @@ refine_clustering( // Update cluster 
weight, weighted degree and cut for edge sources src_louvain_cluster_weight_cache = edge_src_property_t(handle, graph_view); - update_edge_src_property( - handle, graph_view, vertex_louvain_cluster_weights.begin(), src_louvain_cluster_weight_cache); + update_edge_src_property(handle, + graph_view, + vertex_louvain_cluster_weights.begin(), + src_louvain_cluster_weight_cache.mutable_view()); src_cut_to_louvain_cache = edge_src_property_t(handle, graph_view); - update_edge_src_property( - handle, graph_view, weighted_cut_of_vertices_to_louvain.begin(), src_cut_to_louvain_cache); + update_edge_src_property(handle, + graph_view, + weighted_cut_of_vertices_to_louvain.begin(), + src_cut_to_louvain_cache.mutable_view()); vertex_louvain_cluster_weights.resize(0, handle.get_stream()); vertex_louvain_cluster_weights.shrink_to_fit(handle.get_stream()); @@ -329,15 +333,15 @@ refine_clustering( edge_src_property_t(handle, graph_view); update_edge_src_property( - handle, graph_view, leiden_assignment.begin(), src_leiden_assignment_cache); + handle, graph_view, leiden_assignment.begin(), src_leiden_assignment_cache.mutable_view()); update_edge_dst_property( - handle, graph_view, leiden_assignment.begin(), dst_leiden_assignment_cache); + handle, graph_view, leiden_assignment.begin(), dst_leiden_assignment_cache.mutable_view()); update_edge_src_property(handle, graph_view, singleton_and_connected_flags.begin(), - src_singleton_and_connected_flag_cache); + src_singleton_and_connected_flag_cache.mutable_view()); } auto src_input_property_values = diff --git a/cpp/src/community/ecg_impl.cuh b/cpp/src/community/ecg_impl.cuh index 2698f3cad4d..d01b13f0b35 100644 --- a/cpp/src/community/ecg_impl.cuh +++ b/cpp/src/community/ecg_impl.cuh @@ -64,7 +64,8 @@ std::tuple, size_t, weight_t> ecg( edge_dst_property_t dst_cluster_assignments(handle, graph_view); edge_property_t modified_edge_weights(handle, graph_view); - cugraph::fill_edge_property(handle, graph_view, weight_t{0}, 
modified_edge_weights); + cugraph::fill_edge_property( + handle, graph_view, modified_edge_weights.mutable_view(), weight_t{0}); weight_t modularity = -1.0; rmm::device_uvector cluster_assignments(graph_view.local_vertex_partition_range_size(), @@ -82,9 +83,9 @@ std::tuple, size_t, weight_t> ecg( resolution); cugraph::update_edge_src_property( - handle, graph_view, cluster_assignments.begin(), src_cluster_assignments); + handle, graph_view, cluster_assignments.begin(), src_cluster_assignments.mutable_view()); cugraph::update_edge_dst_property( - handle, graph_view, cluster_assignments.begin(), dst_cluster_assignments); + handle, graph_view, cluster_assignments.begin(), dst_cluster_assignments.mutable_view()); cugraph::transform_e( handle, @@ -128,9 +129,9 @@ std::tuple, size_t, weight_t> ecg( if constexpr (multi_gpu) { cugraph::update_edge_src_property( - handle, graph_view, cluster_assignments.begin(), src_cluster_assignments); + handle, graph_view, cluster_assignments.begin(), src_cluster_assignments.mutable_view()); cugraph::update_edge_dst_property( - handle, graph_view, cluster_assignments.begin(), dst_cluster_assignments); + handle, graph_view, cluster_assignments.begin(), dst_cluster_assignments.mutable_view()); } auto [cluster_keys, cluster_weights] = cugraph::detail::compute_cluster_keys_and_values( diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh index f830e6a7700..f2d4c04dffd 100644 --- a/cpp/src/community/k_truss_impl.cuh +++ b/cpp/src/community/k_truss_impl.cuh @@ -595,8 +595,10 @@ k_truss(raft::handle_t const& handle, cur_graph_view); edge_dst_property_t edge_dst_out_degrees(handle, cur_graph_view); - update_edge_src_property(handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees); - update_edge_dst_property(handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees); + update_edge_src_property( + handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees.mutable_view()); + 
update_edge_dst_property( + handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees.mutable_view()); rmm::device_uvector srcs(0, handle.get_stream()); rmm::device_uvector dsts(0, handle.get_stream()); @@ -695,7 +697,7 @@ k_truss(raft::handle_t const& handle, (*num_triangles).begin()); cugraph::edge_property_t edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, cur_graph_view, true, edge_mask); + cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), true); cur_graph_view.attach_edge_mask(edge_mask.view()); while (true) { diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh index 166eb334301..da790a5dd66 100644 --- a/cpp/src/community/leiden_impl.cuh +++ b/cpp/src/community/leiden_impl.cuh @@ -236,8 +236,10 @@ std::pair>, weight_t> leiden( if constexpr (graph_view_t::is_multi_gpu) { src_vertex_weights_cache = edge_src_property_t(handle, current_graph_view); - update_edge_src_property( - handle, current_graph_view, vertex_weights.begin(), src_vertex_weights_cache); + update_edge_src_property(handle, + current_graph_view, + vertex_weights.begin(), + src_vertex_weights_cache.mutable_view()); } #ifdef TIMING @@ -265,13 +267,13 @@ std::pair>, weight_t> leiden( update_edge_src_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - src_louvain_assignment_cache); + src_louvain_assignment_cache.mutable_view()); dst_louvain_assignment_cache = edge_dst_property_t(handle, current_graph_view); update_edge_dst_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - dst_louvain_assignment_cache); + dst_louvain_assignment_cache.mutable_view()); } weight_t new_Q = detail::compute_modularity(handle, @@ -328,11 +330,11 @@ std::pair>, weight_t> leiden( update_edge_src_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - src_louvain_assignment_cache); + src_louvain_assignment_cache.mutable_view()); 
update_edge_dst_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - dst_louvain_assignment_cache); + dst_louvain_assignment_cache.mutable_view()); } std::tie(cluster_keys, cluster_weights) = @@ -403,11 +405,11 @@ std::pair>, weight_t> leiden( update_edge_src_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - src_louvain_assignment_cache); + src_louvain_assignment_cache.mutable_view()); update_edge_dst_property(handle, current_graph_view, louvain_assignment_for_vertices.begin(), - dst_louvain_assignment_cache); + dst_louvain_assignment_cache.mutable_view()); } std::tie(refined_leiden_partition, leiden_to_louvain_map) = diff --git a/cpp/src/community/louvain_impl.cuh b/cpp/src/community/louvain_impl.cuh index a4b4b4a7bcd..ef90e8fd555 100644 --- a/cpp/src/community/louvain_impl.cuh +++ b/cpp/src/community/louvain_impl.cuh @@ -138,8 +138,10 @@ std::pair>, weight_t> louvain( src_vertex_weights_cache = edge_src_property_t(handle, current_graph_view); - update_edge_src_property( - handle, current_graph_view, vertex_weights_v.begin(), src_vertex_weights_cache); + update_edge_src_property(handle, + current_graph_view, + vertex_weights_v.begin(), + src_vertex_weights_cache.mutable_view()); vertex_weights_v.resize(0, handle.get_stream()); vertex_weights_v.shrink_to_fit(handle.get_stream()); } @@ -167,10 +169,10 @@ std::pair>, weight_t> louvain( if constexpr (multi_gpu) { src_clusters_cache = edge_src_property_t(handle, current_graph_view); update_edge_src_property( - handle, current_graph_view, next_clusters_v.begin(), src_clusters_cache); + handle, current_graph_view, next_clusters_v.begin(), src_clusters_cache.mutable_view()); dst_clusters_cache = edge_dst_property_t(handle, current_graph_view); update_edge_dst_property( - handle, current_graph_view, next_clusters_v.begin(), dst_clusters_cache); + handle, current_graph_view, next_clusters_v.begin(), dst_clusters_cache.mutable_view()); } weight_t new_Q = 
detail::compute_modularity(handle, @@ -208,9 +210,9 @@ std::pair>, weight_t> louvain( if constexpr (graph_view_t::is_multi_gpu) { update_edge_src_property( - handle, current_graph_view, next_clusters_v.begin(), src_clusters_cache); + handle, current_graph_view, next_clusters_v.begin(), src_clusters_cache.mutable_view()); update_edge_dst_property( - handle, current_graph_view, next_clusters_v.begin(), dst_clusters_cache); + handle, current_graph_view, next_clusters_v.begin(), dst_clusters_cache.mutable_view()); } std::tie(cluster_keys_v, cluster_weights_v) = detail::compute_cluster_keys_and_values( diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh index 7bccfafa043..0b453cfe262 100644 --- a/cpp/src/community/triangle_count_impl.cuh +++ b/cpp/src/community/triangle_count_impl.cuh @@ -194,7 +194,8 @@ void triangle_count(raft::handle_t const& handle, if (vertices) { cugraph::edge_property_t within_two_hop_edge_mask( handle, cur_graph_view); - cugraph::fill_edge_property(handle, unmasked_cur_graph_view, false, within_two_hop_edge_mask); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, within_two_hop_edge_mask.mutable_view(), false); rmm::device_uvector unique_vertices((*vertices).size(), handle.get_stream()); thrust::copy( @@ -322,10 +323,14 @@ void triangle_count(raft::handle_t const& handle, handle, cur_graph_view); edge_dst_property_t edge_dst_within_two_hop_flags( handle, cur_graph_view); - update_edge_src_property( - handle, cur_graph_view, within_two_hop_flags.begin(), edge_src_within_two_hop_flags); - update_edge_dst_property( - handle, cur_graph_view, within_two_hop_flags.begin(), edge_dst_within_two_hop_flags); + update_edge_src_property(handle, + cur_graph_view, + within_two_hop_flags.begin(), + edge_src_within_two_hop_flags.mutable_view()); + update_edge_dst_property(handle, + cur_graph_view, + within_two_hop_flags.begin(), + edge_dst_within_two_hop_flags.mutable_view()); transform_e( handle, 
@@ -348,7 +353,8 @@ void triangle_count(raft::handle_t const& handle, { cugraph::edge_property_t self_loop_edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, unmasked_cur_graph_view, false, self_loop_edge_mask); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, self_loop_edge_mask.mutable_view(), false); transform_e( handle, @@ -369,7 +375,8 @@ void triangle_count(raft::handle_t const& handle, { cugraph::edge_property_t in_two_core_edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, unmasked_cur_graph_view, false, in_two_core_edge_mask); + cugraph::fill_edge_property( + handle, unmasked_cur_graph_view, in_two_core_edge_mask.mutable_view(), false); rmm::device_uvector core_numbers(cur_graph_view.number_of_vertices(), handle.get_stream()); @@ -388,9 +395,9 @@ void triangle_count(raft::handle_t const& handle, in_two_core_first + core_numbers.size(), in_two_core_flags.begin()); update_edge_src_property( - handle, cur_graph_view, in_two_core_flags.begin(), edge_src_in_two_cores); + handle, cur_graph_view, in_two_core_flags.begin(), edge_src_in_two_cores.mutable_view()); update_edge_dst_property( - handle, cur_graph_view, in_two_core_flags.begin(), edge_dst_in_two_cores); + handle, cur_graph_view, in_two_core_flags.begin(), edge_dst_in_two_cores.mutable_view()); transform_e( handle, @@ -420,8 +427,10 @@ void triangle_count(raft::handle_t const& handle, cur_graph_view); edge_dst_property_t edge_dst_out_degrees(handle, cur_graph_view); - update_edge_src_property(handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees); - update_edge_dst_property(handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees); + update_edge_src_property( + handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees.mutable_view()); + update_edge_dst_property( + handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees.mutable_view()); auto [srcs, dsts] = extract_transform_e(handle, cur_graph_view, 
edge_src_out_degrees.view(), diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh index 550edf9807a..b593c639946 100644 --- a/cpp/src/components/mis_impl.cuh +++ b/cpp/src/components/mis_impl.cuh @@ -16,7 +16,6 @@ */ #pragma once -#include "prims/fill_edge_property.cuh" #include "prims/fill_edge_src_dst_property.cuh" #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/update_edge_src_dst_property.cuh" @@ -174,8 +173,10 @@ rmm::device_uvector maximal_independent_set( if constexpr (multi_gpu) { src_rank_cache = edge_src_property_t(handle, graph_view); dst_rank_cache = edge_dst_property_t(handle, graph_view); - update_edge_src_property(handle, graph_view, temporary_ranks.begin(), src_rank_cache); - update_edge_dst_property(handle, graph_view, temporary_ranks.begin(), dst_rank_cache); + update_edge_src_property( + handle, graph_view, temporary_ranks.begin(), src_rank_cache.mutable_view()); + update_edge_dst_property( + handle, graph_view, temporary_ranks.begin(), dst_rank_cache.mutable_view()); } // diff --git a/cpp/src/components/vertex_coloring_impl.cuh b/cpp/src/components/vertex_coloring_impl.cuh index bffaab34990..fa7fb1f6099 100644 --- a/cpp/src/components/vertex_coloring_impl.cuh +++ b/cpp/src/components/vertex_coloring_impl.cuh @@ -39,10 +39,12 @@ rmm::device_uvector vertex_coloring( // edge mask cugraph::edge_property_t edge_masks_even(handle, current_graph_view); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_even.mutable_view(), bool{false}); cugraph::edge_property_t edge_masks_odd(handle, current_graph_view); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_odd.mutable_view(), bool{false}); cugraph::transform_e( handle, @@ -99,10 +101,10 @@ rmm::device_uvector vertex_coloring( 
cugraph::edge_dst_property_t(handle, current_graph_view); cugraph::update_edge_src_property( - handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags); + handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags.mutable_view()); cugraph::update_edge_dst_property( - handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags); + handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags.mutable_view()); } if (color_id % 2 == 0) { @@ -123,7 +125,8 @@ rmm::device_uvector vertex_coloring( edge_masks_odd.mutable_view()); if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_even.mutable_view(), bool{false}); current_graph_view.attach_edge_mask(edge_masks_odd.view()); } else { cugraph::transform_e( @@ -143,7 +146,8 @@ rmm::device_uvector vertex_coloring( edge_masks_even.mutable_view()); if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); - cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd); + cugraph::fill_edge_property( + handle, current_graph_view, edge_masks_odd.mutable_view(), bool{false}); current_graph_view.attach_edge_mask(edge_masks_even.view()); } diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index f63f28210d8..681523dad90 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -488,8 
+488,10 @@ void weakly_connected_components_impl(raft::handle_t const& handle, ? edge_dst_property_t(handle, level_graph_view) : edge_dst_property_t(handle); if constexpr (GraphViewType::is_multi_gpu) { - fill_edge_dst_property( - handle, level_graph_view, invalid_component_id::value, edge_dst_components); + fill_edge_dst_property(handle, + level_graph_view, + edge_dst_components.mutable_view(), + invalid_component_id::value); } // 2.4 iterate till every vertex gets visited @@ -534,7 +536,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, thrust::get<0>(vertex_frontier.bucket(bucket_idx_cur).begin().get_iterator_tuple()), thrust::get<0>(vertex_frontier.bucket(bucket_idx_cur).end().get_iterator_tuple()), level_components, - edge_dst_components); + edge_dst_components.mutable_view()); } auto max_pushes = GraphViewType::is_multi_gpu diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index 358b0ac2e00..d807ccac5a5 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -177,7 +177,7 @@ void core_number(raft::handle_t const& handle, edge_dst_property_t, edge_t> dst_core_numbers( handle, graph_view); - update_edge_dst_property(handle, graph_view, core_numbers, dst_core_numbers); + update_edge_dst_property(handle, graph_view, core_numbers, dst_core_numbers.mutable_view()); auto k = std::max(k_first, size_t{2}); // degree 0|1 vertices belong to 0|1-core if (graph_view.is_symmetric() && (degree_type == k_core_degree_type_t::INOUT) && @@ -267,7 +267,7 @@ void core_number(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next).begin(), 
vertex_frontier.bucket(bucket_idx_next).end(), core_numbers, - dst_core_numbers); + dst_core_numbers.mutable_view()); vertex_frontier.bucket(bucket_idx_next) .resize(static_cast(thrust::distance( diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh index 6cf27b8c00b..d03d80a8a75 100644 --- a/cpp/src/link_analysis/hits_impl.cuh +++ b/cpp/src/link_analysis/hits_impl.cuh @@ -105,9 +105,10 @@ std::tuple hits(raft::handle_t const& handle, // Initialize hubs from user input if provided if (has_initial_hubs_guess) { - update_edge_src_property(handle, graph_view, prev_hubs, prev_src_hubs); + update_edge_src_property(handle, graph_view, prev_hubs, prev_src_hubs.mutable_view()); } else { - fill_edge_src_property(handle, graph_view, result_t{1.0} / num_vertices, prev_src_hubs); + fill_edge_src_property( + handle, graph_view, prev_src_hubs.mutable_view(), result_t{1.0} / num_vertices); thrust::fill(handle.get_thrust_policy(), prev_hubs, prev_hubs + graph_view.local_vertex_partition_range_size(), @@ -128,7 +129,7 @@ std::tuple hits(raft::handle_t const& handle, reduce_op::plus{}, authorities); - update_edge_dst_property(handle, graph_view, authorities, curr_dst_auth); + update_edge_dst_property(handle, graph_view, authorities, curr_dst_auth.mutable_view()); // Update current source hubs property per_v_transform_reduce_outgoing_e( @@ -166,7 +167,7 @@ std::tuple hits(raft::handle_t const& handle, [] __device__(auto, auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); }, result_t{0}); - update_edge_src_property(handle, graph_view, curr_hubs, prev_src_hubs); + update_edge_src_property(handle, graph_view, curr_hubs, prev_src_hubs.mutable_view()); // Swap pointers for the next iteration // After this swap call, prev_hubs has the latest value of hubs diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 9a7e0319de5..7976eac0e89 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ 
b/cpp/src/link_analysis/pagerank_impl.cuh @@ -261,7 +261,8 @@ centrality_algorithm_metadata_t pagerank( return pagerank / divisor; }); - update_edge_src_property(handle, pull_graph_view, pageranks.data(), edge_src_pageranks); + update_edge_src_property( + handle, pull_graph_view, pageranks.data(), edge_src_pageranks.mutable_view()); auto unvarying_part = aggregate_personalization_vector_size == 0 ? (dangling_sum * alpha + static_cast(1.0 - alpha)) / diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 6a8882dcfab..487f31e5e03 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -299,7 +299,7 @@ all_pairs_similarity(raft::handle_t const& handle, // FIXME: If vertices is specified, this could be done on a subset of the vertices // edge_dst_property_t edge_dst_degrees(handle, graph_view); - update_edge_dst_property(handle, graph_view, degrees.begin(), edge_dst_degrees); + update_edge_dst_property(handle, graph_view, degrees.begin(), edge_dst_degrees.mutable_view()); per_v_transform_reduce_incoming_e( handle, diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 04448c9e51d..177c79ace87 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -118,6 +118,7 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - __shared__ edge_t - warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; - __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; + int32_t constexpr shared_array_size = max_one_e_per_frontier_key + ? 
int32_t{1} /* dummy */ + : extract_transform_v_frontier_e_kernel_block_size; + __shared__ std::conditional_t + warp_local_degree_inclusive_sums[shared_array_size]; + __shared__ std::conditional_t + warp_key_local_edge_offsets[shared_array_size]; using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage; + __shared__ std:: + conditional_t + temp_storage; auto indices = edge_partition.indices(); @@ -179,104 +186,128 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( ((static_cast(num_keys) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); while (idx < rounded_up_num_keys) { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets + auto call_e_op = call_e_op_with_key_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op}; + edge_t edge_offset{0}; edge_t local_degree{0}; - if (lane_id < static_cast(max_key_idx - min_key_idx)) { + if (idx < num_keys) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); if constexpr (hypersparse) { auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); if (major_hypersparse_idx) { - auto major_idx = major_start_offset + *major_hypersparse_idx; - local_degree = edge_partition.local_degree(major_idx); - warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_idx); - } else { - local_degree = edge_t{0}; - warp_key_local_edge_offsets[threadIdx.x] = edge_t{0}; // dummy + auto major_idx = major_start_offset + *major_hypersparse_idx; + edge_offset = edge_partition.local_offset(major_idx); + local_degree = edge_partition.local_degree(major_idx); } } else { auto major_offset = 
edge_partition.major_offset_from_major_nocheck(major); + edge_offset = edge_partition.local_offset(major_offset); local_degree = edge_partition.local_degree(major_offset); - warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_offset); } } - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); - - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) - - auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - - auto call_e_op = call_e_op_with_key_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op}; - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + if constexpr (max_one_e_per_frontier_key) { + // each thread processes one frontier key, exits if any edge returns a valid output - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + e_op_result_t e_op_result{thrust::nullopt}; + auto key = *(key_first + idx); - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); + if (edge_partition_e_mask) { + for (edge_t i = 0; i < local_degree; ++i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + e_op_result = call_e_op(key, edge_offset + i); + if (e_op_result) { break; } } } - - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } else { + for (edge_t i = 0; i < local_degree; ++i) { + e_op_result = call_e_op(key, edge_offset + i); + if (e_op_result) { break; } + } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive + + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets + + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); + + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) + + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto 
this_warp_inclusive_sum_last = + this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); + } + } - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); + } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } } } @@ -284,7 +315,8 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); + using WarpReduce = cub::WarpReduce; + __shared__ std::conditional_t + temp_storage[max_one_e_per_frontier_key + ? 
(extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); @@ -363,16 +403,42 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( e_op_result = call_e_op(i); } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + if constexpr (max_one_e_per_frontier_key) { + auto first_valid_lane_id = + WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { + auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); + push_buffer_element( + buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); + } + if (first_valid_lane_id != raft::warp_size()) { break; } + } else { + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } } } else { for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + if constexpr (max_one_e_per_frontier_key) { + auto first_valid_lane_id = + WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { + auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); + push_buffer_element( + buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); + } + if (first_valid_lane_id != raft::warp_size()) { break; } + } else { + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } } } @@ -380,7 +446,8 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template buffer_idx(*buffer_idx_ptr); + using BlockReduce = cub::BlockReduce; + __shared__ std::conditional_t + temp_storage; + __shared__ int32_t output_thread_id; + while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); @@ -458,16 +532,46 @@ __global__ static void extract_transform_v_frontier_e_high_degree( e_op_result = call_e_op(i); } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + if constexpr (max_one_e_per_frontier_key) { + auto first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? 
threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + if (threadIdx.x == output_thread_id) { + auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); + push_buffer_element( + buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); + } + if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } + } else { + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } } } else { for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { e_op_result_t e_op_result{thrust::nullopt}; if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + if constexpr (max_one_e_per_frontier_key) { + auto first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? 
threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + if (threadIdx.x == output_thread_id) { + auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); + push_buffer_element( + buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); + } + if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } + } else { + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } } } @@ -477,6 +581,10 @@ __global__ static void extract_transform_v_frontier_e_high_degree( template ( @@ -699,7 +810,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_block_t update_grid(h_offsets[0], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree + extract_transform_v_frontier_e_high_degree <<>>( edge_partition, edge_partition_frontier_key_first, @@ -717,7 +828,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree + extract_transform_v_frontier_e_mid_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[0], @@ -735,7 +846,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[1], @@ -753,7 +866,9 @@ 
extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[2], @@ -773,7 +888,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first, diff --git a/cpp/src/prims/edge_bucket.cuh b/cpp/src/prims/edge_bucket.cuh index 9fbf47c3288..bca294e5628 100644 --- a/cpp/src/prims/edge_bucket.cuh +++ b/cpp/src/prims/edge_bucket.cuh @@ -92,6 +92,29 @@ class edge_bucket_t { { } + template >* = nullptr> + edge_bucket_t(raft::handle_t const& handle, + rmm::device_uvector&& srcs, + rmm::device_uvector&& dsts) + : handle_ptr_(&handle), + majors_(std::move(src_major ? srcs : dsts)), + minors_(std::move(src_major ? dsts : srcs)), + tags_(std::byte{0}) + { + } + + template >* = nullptr> + edge_bucket_t(raft::handle_t const& handle, + rmm::device_uvector&& srcs, + rmm::device_uvector&& dsts, + rmm::device_uvector&& tags) + : handle_ptr_(&handle), + majors_(std::move(src_major ? srcs : dsts)), + minors_(std::move(src_major ? 
dsts : srcs)), + tags_(std::move(tags)) + { + } + /** * @ brief insert an edge to the bucket * diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index 5741c98d90e..d51e03628e1 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last())); auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e( + std::tie(std::ignore, value_buffer) = detail:: + extract_transform_v_frontier_e( handle, graph_view, frontier, diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index b9c63481420..7ad033b93c2 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -100,14 +100,14 @@ extract_transform_v_frontier_outgoing_e(raft::handle_t const& handle, auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + do_expensive_check); return value_buffer; } diff --git a/cpp/src/prims/fill_edge_property.cuh b/cpp/src/prims/fill_edge_property.cuh index 161ec623287..54d0c454ec2 100644 --- a/cpp/src/prims/fill_edge_property.cuh +++ b/cpp/src/prims/fill_edge_property.cuh @@ -33,11 +33,11 @@ namespace cugraph { namespace detail { -template +template void fill_edge_property(raft::handle_t const& handle, GraphViewType const& graph_view, - T 
input, - EdgePropertyOutputWrapper edge_property_output) + EdgePropertyOutputWrapper edge_property_output, + T input) { static_assert(std::is_same_v); @@ -123,27 +123,28 @@ void fill_edge_property(raft::handle_t const& handle, * @brief Fill graph edge property values to the input value. * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeValueOutputWrapper Type of the wrapper for output edge property values. * @tparam T Type of the edge property values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. + * @param edge_property_output edge_property_view_t class object to store edge property values (for + * the edges assigned to this process in multi-GPU). * @param input Edge property values will be set to @p input. - * @param edge_property_output edge_property_t class object to store edge property values (for the - * edges assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
*/ -template +template void fill_edge_property(raft::handle_t const& handle, GraphViewType const& graph_view, + EdgeValueOutputWrapper edge_property_output, T input, - edge_property_t& edge_property_output, bool do_expensive_check = false) { if (do_expensive_check) { // currently, nothing to do } - detail::fill_edge_property(handle, graph_view, input, edge_property_output.mutable_view()); + detail::fill_edge_property(handle, graph_view, edge_property_output, input); } } // namespace cugraph diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 152de3f8d4a..58dbf7e74a0 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -15,29 +15,86 @@ */ #pragma once +#include +#include #include #include +#include #include +#include +#include #include #include +#include #include +#include #include +#include namespace cugraph { namespace detail { -template +template +__device__ std::enable_if_t, void> fill_thrust_tuple_element(Iterator iter, + size_t offset, + T value) +{ + packed_bool_atomic_set(iter, offset, value); +} + +template +__device__ std::enable_if_t, void> fill_thrust_tuple_element(Iterator iter, + size_t offset, + T value) +{ + *(iter + offset) = value; +} + +template +__device__ void fill_thrust_tuple(Iterator iter, size_t offset, T value, std::index_sequence) +{ + ((fill_thrust_tuple_element( + thrust::get(iter.get_iterator_tuple()), offset, thrust::get(value))), + ...); +} + +template +__device__ void fill_scalar_or_thrust_tuple(Iterator iter, size_t offset, T value) +{ + if constexpr (std::is_arithmetic_v) { + if constexpr (cugraph::is_packed_bool) { + packed_bool_atomic_set(iter, offset, value); + } else { + *(iter + offset) = value; + } + } else { + if constexpr (cugraph::has_packed_bool_element) { + fill_thrust_tuple( + iter, offset, value, std::make_index_sequence::value>()); + } else { + *(iter + offset) = value; + } + } +} + +template void 
fill_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - T input, - EdgeMajorPropertyOutputWrapper edge_major_property_output) + EdgeMajorPropertyOutputWrapper edge_major_property_output, + T input) { + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); static_assert(std::is_same_v); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); auto keys = edge_major_property_output.keys(); auto value_firsts = edge_major_property_output.value_firsts(); @@ -54,10 +111,7 @@ void fill_edge_major_property(raft::handle_t const& handle, static_cast(graph_view.local_edge_partition_src_range_size(i)); } } - if constexpr (cugraph::has_packed_bool_element< - std::remove_reference_t, - T>()) { - static_assert(std::is_arithmetic_v, "unimplemented for thrust::tuple types."); + if constexpr (contains_packed_bool_element) { auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask(); thrust::fill_n(handle.get_thrust_policy(), value_firsts[i], @@ -69,12 +123,139 @@ void fill_edge_major_property(raft::handle_t const& handle, } } -template +template +void fill_edge_major_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + EdgeMajorPropertyOutputWrapper edge_major_property_output, + T input) +{ + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(std::is_same_v); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + + auto edge_partition_value_firsts = edge_major_property_output.value_firsts(); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto const 
comm_rank = comm.get_rank(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(minor_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + + auto edge_partition_keys = edge_major_property_output.keys(); + for (int i = 0; i < minor_comm_size; ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + + device_bcast( + minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + + if (edge_partition_keys) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(rx_counts[i]), + [rx_vertex_first = rx_vertices.begin(), + input, + edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), + edge_partition_key_last = ((*edge_partition_keys)[i]).end(), + edge_partition_value_first = edge_partition_value_firsts[i]] __device__(size_t i) { + auto major = *(rx_vertex_first + i); + auto it = thrust::lower_bound( + thrust::seq, edge_partition_key_first, edge_partition_key_last, major); + if ((it != edge_partition_key_last) && (*it == major)) { + auto edge_partition_offset = thrust::distance(edge_partition_key_first, it); + if constexpr (contains_packed_bool_element) { + packe_bool_atomic_set(edge_partition_value_first, edge_partition_offset, input); + } else { + *(edge_partition_value_first + edge_partition_offset) = input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + 
thrust::make_counting_iterator(static_cast(rx_counts[i])), + [edge_partition, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_firsts[i]] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto major_offset = edge_partition.major_offset_from_major_nocheck(rx_vertex); + packed_bool_atomic_set(output_value_first, major_offset, input); + }); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { + return edge_partition.major_offset_from_major_nocheck(v); + })); + auto val_first = thrust::make_constant_iterator(input); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + rx_counts[i], + map_first, + edge_partition_value_firsts[i]); + } + } + } + } else { + assert(graph_view.local_vertex_partition_range_size() == GraphViewType::is_storage_transposed + ? 
graph_view.local_edge_partition_dst_range_size() + : graph_view.local_edge_partition_src_range_size()); + assert(edge_partition_value_firsts.size() == size_t{1}); + if constexpr (contains_packed_bool_element) { + thrust::for_each(handle.get_thrust_policy(), + vertex_first, + vertex_last, + [input, output_value_first = edge_partition_value_firsts[0]] __device__( + auto v) { packed_bool_atomic_set(output_value_first, v, input); }); + } else { + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(vertex_first, vertex_last), + vertex_first, + edge_partition_value_firsts[0]); + } + } +} + +template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - T input, - EdgeMinorPropertyOutputWrapper edge_minor_property_output) + EdgeMinorPropertyOutputWrapper edge_minor_property_output, + T input) { + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); static_assert(std::is_same_v); auto keys = edge_minor_property_output.keys(); @@ -89,7 +270,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } auto value_first = edge_minor_property_output.value_first(); - if constexpr (cugraph::has_packed_bool_element()) { + if constexpr (contains_packed_bool_element) { static_assert(std::is_arithmetic_v, "unimplemented for thrust::tuple types."); auto packed_input = input ? 
packed_bool_full_mask() : packed_bool_empty_mask(); thrust::fill_n( @@ -99,6 +280,136 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } +template +void fill_edge_minor_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + EdgeMinorPropertyOutputWrapper edge_minor_property_output, + T input) +{ + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(std::is_same_v); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + + auto edge_partition_value_first = edge_minor_property_output.value_first(); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(major_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + + std::optional> key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(size_t{0})); + auto edge_partition_keys = edge_minor_property_output.keys(); + for (int i = 0; i < major_comm_size; ++i) { + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + 
major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + + if (edge_partition_keys) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(rx_counts[i]), + [rx_vertex_first = rx_vertices.begin(), + input, + subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], + subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { + auto minor = *(rx_vertex_first + i); + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(rx_counts[i])), + [edge_partition, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto minor_offset = + edge_partition.minor_offset_from_minor_nocheck(rx_vertex); + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { + return edge_partition.minor_offset_from_minor_nocheck(v); + })); + auto val_first = thrust::make_constant_iterator(input); + // FIXME: this scatter is unnecessary if NCCL 
directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + rx_counts[i], + map_first, + edge_partition_value_first); + } + } + } + } else { + assert(graph_view.local_vertex_partition_range_size() == + graph_view.local_edge_partition_src_range_size()); + if constexpr (contains_packed_bool_element) { + thrust::for_each(handle.get_thrust_policy(), + vertex_first, + vertex_last, + [input, output_value_first = edge_partition_value_first] __device__(auto v) { + fill_scalar_or_thrust_tuple(output_value_first, v, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(vertex_first, vertex_last), + vertex_first, + edge_partition_value_first); + } + } +} + } // namespace detail /** @@ -108,32 +419,96 @@ void fill_edge_minor_property(raft::handle_t const& handle, * to this process in multi-GPU). * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueOutputWrapper Type of the wrapper for output edge source property values. * @tparam T Type of the edge source property values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. + * @param edge_src_property_output edge_src_property_view_t class object to store source property + * values (for the edge source assigned to this process in multi-GPU). * @param input Edge source property values will be set to @p input. - * @param edge_src_property_output edge_src_property_t class object to store source property values - * (for the edge source assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
*/ -template +template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, + EdgeSrcValueOutputWrapper edge_src_property_output, T input, - edge_src_property_t& edge_src_property_output, bool do_expensive_check = false) { + static_assert(std::is_same_v); if (do_expensive_check) { // currently, nothing to do } + if constexpr (GraphViewType::is_storage_transposed) { + detail::fill_edge_minor_property(handle, graph_view, edge_src_property_output, input); + } else { + detail::fill_edge_major_property(handle, graph_view, edge_src_property_output, input); + } +} + +/** + * @brief Fill graph edge source property values to the input value. + * + * This version fills only a subset of graph edge source property values. [@p vertex_first, + * @p vertex_last) specifies the vertices to be filled. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexIterator Type of the iterator for vertex identifiers. + * @tparam EdgeSrcValueOutputWrapper Type of the wrapper for output edge source property values. + * @tparam T Type of the edge source property values. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. + * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition + * assigned to this process in multi-GPU), otherwise undefined behavior. + * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param edge_src_property_output edge_src_property_view_t class object to store source property + * values (for the edge source assigned to this process in multi-GPU). + * @param input Edge source property values will be set to @p input. 
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void fill_edge_src_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + EdgeSrcValueOutputWrapper edge_src_property_output, + T input, + bool do_expensive_check = false) +{ + static_assert(std::is_same_v); + if (do_expensive_check) { + auto num_invalids = thrust::count_if( + handle.get_thrust_policy(), + vertex_first, + vertex_last, + [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), + local_vertex_partition_range_last = + graph_view.local_vertex_partition_range_last()] __device__(auto v) { + return (v < local_vertex_partition_range_first) || (v >= local_vertex_partition_range_last); + }); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + num_invalids = + host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); + } + CUGRAPH_EXPECTS( + num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + } + if constexpr (GraphViewType::is_storage_transposed) { detail::fill_edge_minor_property( - handle, graph_view, input, edge_src_property_output.mutable_view()); + handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); } else { detail::fill_edge_major_property( - handle, graph_view, input, edge_src_property_output.mutable_view()); + handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); } } @@ -144,32 +519,98 @@ void fill_edge_src_property(raft::handle_t const& handle, * (assigned to this process in multi-GPU). * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeDstValueOutputWrapper Type of the wrapper for output edge destination property + * values. * @tparam T Type of the edge destination property values. 
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. + * @param edge_dst_property_output edge_dst_property_view_t class object to store destination + * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. - * @param edge_dst_property_output edge_dst_property_t class object to store destination property - * values (for the edge destinations assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ -template +template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, + EdgeDstValueOutputWrapper edge_dst_property_output, T input, - edge_dst_property_t& edge_dst_property_output, bool do_expensive_check = false) { + static_assert(std::is_same_v); if (do_expensive_check) { // currently, nothing to do } + if constexpr (GraphViewType::is_storage_transposed) { + detail::fill_edge_major_property(handle, graph_view, edge_dst_property_output, input); + } else { + detail::fill_edge_minor_property(handle, graph_view, edge_dst_property_output, input); + } +} + +/** + * @brief Fill graph edge destination property values to the input value. + * + * This version fills only a subset of graph edge destination property values. [@p vertex_first, + * @p vertex_last) specifies the vertices to be filled. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexIterator Type of the iterator for vertex identifiers. + * @tparam EdgeDstValueOutputWrapper Type of the wrapper for output edge destination property + * values. + * @tparam T Type of the edge destination property values. + * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. + * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition + * assigned to this process in multi-GPU), otherwise undefined behavior. + * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param edge_dst_property_output edge_dst_property_view_t class object to store destination + * property values (for the edge destinations assigned to this process in multi-GPU). + * @param input Edge destination property values will be set to @p input. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void fill_edge_dst_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + EdgeDstValueOutputWrapper edge_dst_property_output, + T input, + bool do_expensive_check = false) +{ + static_assert(std::is_same_v); + if (do_expensive_check) { + auto num_invalids = thrust::count_if( + handle.get_thrust_policy(), + vertex_first, + vertex_last, + [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), + local_vertex_partition_range_last = + graph_view.local_vertex_partition_range_last()] __device__(auto v) { + return (v < local_vertex_partition_range_first) || (v >= local_vertex_partition_range_last); + }); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + num_invalids = + host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); + } + CUGRAPH_EXPECTS( + num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + } + if constexpr (GraphViewType::is_storage_transposed) { 
detail::fill_edge_major_property( - handle, graph_view, input, edge_dst_property_output.mutable_view()); + handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); } else { detail::fill_edge_minor_property( - handle, graph_view, input, edge_dst_property_output.mutable_view()); + handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); } } diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 7253fde8d4e..015a9c683f1 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -156,6 +156,8 @@ struct transform_local_nbr_indices_t { edge_partition_e_value_input.get(edge_offset + local_nbr_idx)); } else if (invalid_value) { return *invalid_value; + } else { + return T{}; } } }; diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 509ab56d3fe..027ef1f662d 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -660,7 +660,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); minor_init = (major_comm_rank == 0) ? 
init : ReduceOp::identity_element; } - fill_edge_minor_property(handle, graph_view, minor_init, minor_tmp_buffer->mutable_view()); + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); } else { thrust::fill(handle.get_thrust_policy(), vertex_value_output_first, diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh similarity index 64% rename from cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh rename to cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh index 0432e25ae86..e58ab08fa97 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh @@ -74,7 +74,8 @@ namespace detail { int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; -template (key) : dst; if constexpr (std::is_same_v && std::is_same_v) { - return dst; + return reduce_by; } else if constexpr (std::is_same_v && !std::is_same_v) { - return thrust::make_tuple(dst, *e_op_result); + return thrust::make_tuple(reduce_by, *e_op_result); } else if constexpr (!std::is_same_v && std::is_same_v) { - return thrust::make_tuple(dst, *e_op_result); + return thrust::make_tuple(reduce_by, *e_op_result); } else { - return thrust::make_tuple(thrust::make_tuple(dst, thrust::get<0>(*e_op_result)), + return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), thrust::get<1>(*e_op_result)); } } else { @@ -176,6 +178,151 @@ auto sort_and_reduce_buffer_elements( return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); } +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer( + 0, rmm::cuda_stream_view{}))> 
+transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexFrontierBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename VertexFrontierBucketType::key_type; + using payload_t = typename ReduceOp::value_type; + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. fill the buffer + + detail::transform_reduce_v_frontier_call_e_op_t + e_op_wrapper{e_op}; + + bool constexpr max_one_e_per_frontier_key = + reduce_by_src && std::is_same_v>; + auto [key_buffer, payload_buffer] = + detail::extract_transform_v_frontier_e( + handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); + + // 2. reduce the buffer + + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); + if constexpr (GraphViewType::is_multi_gpu) { + // FIXME: this step is unnecessary if major_comm_size== 1 + auto& comm = handle.get_comms(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + std::vector h_vertex_lasts(reduce_by_src ? 
minor_comm_size : major_comm_size); + for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { + auto vertex_partition_id = + reduce_by_src + ? detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, + minor_comm_size, + major_comm_rank, + minor_comm_rank}( + i) + : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); + } + + rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); + raft::update_device( + d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); + rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), + handle.get_stream()); + auto reduce_by_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + thrust::lower_bound(handle.get_thrust_policy(), + reduce_by_first, + reduce_by_first + size_dataframe_buffer(key_buffer), + d_vertex_lasts.begin(), + d_vertex_lasts.end(), + d_tx_buffer_last_boundaries.begin()); + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? 
minor_comm : major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(reduce_by_src ? minor_comm : major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); + } + + if constexpr (!std::is_same_v) { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } else { + return std::move(key_buffer); + } +} + } // namespace detail template @@ -259,6 +406,94 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, return ret; } +/** + * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor + * outputs by (tagged-)source ID. + * + * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are + * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag + * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the + * current (tagged-)vertex frontier. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. 
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param frontier VertexFrontierBucketType class object for the current vertex frontier. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for + * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be + * discarded); 2) dummy (but valid) thrust::optional object (e.g. 
+ * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is + * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be + * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); + * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type + * is not void). + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key + * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order + * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
+ */ +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer( + 0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexFrontierBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); +} + /** * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor * outputs by (tagged-)destination ID. @@ -293,13 +528,14 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). - * @param e_op Quinary operator takes edge source, edge destination, property values for the source, - * destination, and edge and returns 1) thrust::nullopt (if invalid and to be discarded); 2) dummy - * (but valid) thrust::optional object (e.g. thrust::optional{std::byte{0}}, if vertices - * are not tagged and ReduceOp::value_type is void); 3) a tag (if vertices are tagged and - * ReduceOp::value_type is void); 4) a value to be reduced using the @p reduce_op (if vertices are - * not tagged and ReduceOp::value_type is not void); or 5) a tuple of a tag and a value to be - * reduced (if vertices are tagged and ReduceOp::value_type is not void). 
+ * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for + * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be + * discarded); 2) dummy (but valid) thrust::optional object (e.g. + * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is + * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be + * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); + * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type + * is not void). * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and @@ -335,108 +571,15 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, ReduceOp reduce_op, bool do_expensive_check = false) { - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - using payload_t = typename ReduceOp::value_type; - - if (do_expensive_check) { - // currently, nothing to do - } - - // 1. fill the buffer - - detail::transform_reduce_v_frontier_call_e_op_t - e_op_wrapper{e_op}; - - auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); - - // 2. 
reduce the buffer - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - std::vector h_vertex_lasts(major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto minor_range_vertex_partition_id = - detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(minor_range_vertex_partition_id); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto dst_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - dst_first, - dst_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector 
tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - } - - if constexpr (!std::is_same_v) { - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); - } else { - return std::move(key_buffer); - } + return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); } } // namespace cugraph diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index dfac04ce848..1bfdc23c66d 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -22,10 +22,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -56,18 +58,6 @@ namespace cugraph { namespace detail { -template -__device__ void packed_bool_atomic_set(Iterator value_first, vertex_t offset, bool val) -{ - auto packed_output_offset = packed_bool_offset(offset); - auto packed_output_mask = packed_bool_mask(offset); - if (val) { - 
atomicOr(value_first + packed_output_offset, packed_output_mask); - } else { - atomicAnd(value_first + packed_output_offset, ~packed_output_mask); - } -} - template void pack_bools(raft::handle_t const& handle, BoolInputIterator input_first, @@ -130,8 +120,12 @@ void update_edge_major_property(raft::handle_t const& handle, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { - constexpr bool packed_bool = - std::is_same_v; + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); auto edge_partition_value_firsts = edge_major_property_output.value_firsts(); if constexpr (GraphViewType::is_multi_gpu) { @@ -157,16 +151,17 @@ void update_edge_major_property(raft::handle_t const& handle, max_rx_size, graph_view.vertex_partition_range_size(major_range_vertex_partition_id)); } auto rx_value_buffer = allocate_dataframe_buffer< - std::conditional_t>( - packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream()); + contains_packed_bool_element ? 
packed_bool_size(max_rx_size) : max_rx_size, + handle.get_stream()); auto rx_value_first = get_dataframe_buffer_begin(rx_value_buffer); for (int i = 0; i < minor_comm_size; ++i) { auto major_range_vertex_partition_id = compute_local_edge_partition_major_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { if (i == minor_comm_rank) { pack_bools(handle, vertex_property_input_first, @@ -220,7 +215,7 @@ void update_edge_major_property(raft::handle_t const& handle, auto major_range_vertex_partition_id = compute_local_edge_partition_major_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { if (i == minor_comm_rank) { pack_bools(handle, vertex_property_input_first, @@ -250,7 +245,7 @@ void update_edge_major_property(raft::handle_t const& handle, ? 
graph_view.local_edge_partition_dst_range_size() : graph_view.local_edge_partition_src_range_size()); assert(edge_partition_value_firsts.size() == size_t{1}); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { pack_bools(handle, vertex_property_input_first, vertex_property_input_first + graph_view.local_vertex_partition_range_size(), @@ -275,8 +270,12 @@ void update_edge_major_property(raft::handle_t const& handle, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { - constexpr bool packed_bool = - std::is_same_v; + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -299,9 +298,11 @@ void update_edge_major_property(raft::handle_t const& handle, }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); auto rx_tmp_buffer = allocate_dataframe_buffer< - std:: - conditional_t>( - packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream()); + std::conditional_t>( + contains_packed_bool_element ? 
packed_bool_size(max_rx_size) : max_rx_size, + handle.get_stream()); auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); auto edge_partition_keys = edge_major_property_output.keys(); @@ -314,7 +315,7 @@ void update_edge_major_property(raft::handle_t const& handle, auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( vertex_first, cuda::proclaim_return_type([vertex_property_input_first, @@ -351,7 +352,7 @@ void update_edge_major_property(raft::handle_t const& handle, device_bcast(minor_comm, rx_value_first, rx_value_first, - packed_bool ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], i, handle.get_stream()); @@ -370,7 +371,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::seq, edge_partition_key_first, edge_partition_key_last, major); if ((it != edge_partition_key_last) && (*it == major)) { auto edge_partition_offset = thrust::distance(edge_partition_key_first, it); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { auto rx_value = static_cast(*(rx_value_first + packed_bool_offset(i)) & packed_bool_mask(i)); packe_bool_atomic_set(edge_partition_value_first, edge_partition_offset, rx_value); @@ -381,7 +382,7 @@ void update_edge_major_property(raft::handle_t const& handle, } }); } else { - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), @@ -417,7 +418,7 @@ void update_edge_major_property(raft::handle_t const& handle, ? 
graph_view.local_edge_partition_dst_range_size() : graph_view.local_edge_partition_src_range_size()); assert(edge_partition_value_firsts.size() == size_t{1}); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), vertex_first, vertex_last, @@ -445,15 +446,19 @@ void update_edge_minor_property(raft::handle_t const& handle, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { - constexpr bool packed_bool = - std::is_same_v; + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); auto edge_partition_value_first = edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { using vertex_t = typename GraphViewType::vertex_type; using bcast_buffer_type = decltype(allocate_dataframe_buffer< - std::conditional_t>( size_t{0}, handle.get_stream())); @@ -473,7 +478,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // (V/comm_size) * sizeof(value_t) // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) auto bcast_size = static_cast(graph_view.number_of_vertices()) / comm_size; - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { bcast_size /= 8; // bits to bytes } else { bcast_size *= sizeof(typename EdgeMinorPropertyOutputWrapper::value_type); @@ -490,7 +495,7 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_keys = edge_minor_property_output.keys(); std::optional> rx_value_buffers{std::nullopt}; - if (packed_bool || edge_partition_keys) { + if (contains_packed_bool_element || edge_partition_keys) { rx_value_buffers = std::vector{}; (*rx_value_buffers).reserve(num_concurrent_bcasts); for (size_t i = 0; i < num_concurrent_bcasts; ++i) { @@ -508,10 +513,11 
@@ void update_edge_minor_property(raft::handle_t const& handle, } (*rx_value_buffers) .push_back(allocate_dataframe_buffer< - std::conditional_t>( - packed_bool ? packed_bool_size(max_size) : max_size, handle.get_stream())); + contains_packed_bool_element ? packed_bool_size(max_size) : max_size, + handle.get_stream())); } } @@ -539,7 +545,7 @@ void update_edge_minor_property(raft::handle_t const& handle, } for (size_t round = 0; round < num_rounds; ++round) { - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { for (size_t i = 0; i < num_concurrent_bcasts; ++i) { auto j = static_cast(num_rounds * i + round); if (j == major_comm_rank) { @@ -567,7 +573,7 @@ void update_edge_minor_property(raft::handle_t const& handle, rx_value_buffers ? get_dataframe_buffer_begin((*rx_value_buffers)[i]) : edge_partition_value_first + std::get>(key_offsets_or_rx_displacements)[j]; - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { device_bcast(major_comm, rx_value_first, rx_value_first, @@ -595,7 +601,7 @@ void update_edge_minor_property(raft::handle_t const& handle, compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(j); auto rx_value_first = get_dataframe_buffer_begin((*rx_value_buffers)[i]); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { if (edge_partition_keys) { auto key_offsets = std::get>(key_offsets_or_rx_displacements); @@ -657,7 +663,7 @@ void update_edge_minor_property(raft::handle_t const& handle, assert(graph_view.local_vertex_partition_range_size() == GraphViewType::is_storage_transposed ? 
graph_view.local_edge_partition_src_range_size() : graph_view.local_edge_partition_dst_range_size()); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { pack_bools(handle, vertex_property_input_first, vertex_property_input_first + graph_view.local_vertex_partition_range_size(), @@ -682,8 +688,12 @@ void update_edge_minor_property(raft::handle_t const& handle, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { - constexpr bool packed_bool = - std::is_same_v; + constexpr bool contains_packed_bool_element = + cugraph::has_packed_bool_element(); + static_assert(!contains_packed_bool_element || + std::is_arithmetic_v, + "unimplemented for thrust::tuple types with a packed bool element."); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -706,9 +716,11 @@ void update_edge_minor_property(raft::handle_t const& handle, }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); auto rx_tmp_buffer = allocate_dataframe_buffer< - std:: - conditional_t>( - packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream()); + std::conditional_t>( + contains_packed_bool_element ? packed_bool_size(max_rx_size) : max_rx_size, + handle.get_stream()); auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); std::optional> key_offsets{}; @@ -727,7 +739,7 @@ void update_edge_minor_property(raft::handle_t const& handle, auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( vertex_first, cuda::proclaim_return_type([vertex_property_input_first, @@ -764,7 +776,7 @@ void update_edge_minor_property(raft::handle_t const& handle, device_bcast(major_comm, rx_value_first, rx_value_first, - packed_bool ? 
packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], i, handle.get_stream()); @@ -784,7 +796,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); if ((it != subrange_key_last) && (*it == minor)) { auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { auto rx_value = static_cast(*(rx_value_first + packed_bool_offset(i)) & packed_bool_mask(i)); packed_bool_atomic_set( @@ -796,7 +808,7 @@ void update_edge_minor_property(raft::handle_t const& handle, } }); } else { - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), @@ -830,7 +842,7 @@ void update_edge_minor_property(raft::handle_t const& handle, } else { assert(graph_view.local_vertex_partition_range_size() == graph_view.local_edge_partition_src_range_size()); - if constexpr (packed_bool) { + if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), vertex_first, vertex_last, @@ -860,6 +872,7 @@ void update_edge_minor_property(raft::handle_t const& handle, * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexPropertyInputIterator Type of the iterator for vertex property values. + * @tparam EdgeSrcValueOutputWrapper Type of the wrapper for output edge source property values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -867,19 +880,18 @@ void update_edge_minor_property(raft::handle_t const& handle, * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). 
* `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p * graph_view.local_vertex_partition_range_size(). - * @param edge_partition_src_property_output edge_src_property_t class object to store source + * @param edge_partition_src_property_output edge_src_property_view_t class object to store source * property values (for the edge sources assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ -template -void update_edge_src_property( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexPropertyInputIterator vertex_property_input_first, - edge_src_property_t::value_type>& - edge_src_property_output, - bool do_expensive_check = false) +template +void update_edge_src_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPropertyInputIterator vertex_property_input_first, + EdgeSrcValueOutputWrapper edge_src_property_output, + bool do_expensive_check = false) { if (do_expensive_check) { // currently, nothing to do @@ -887,10 +899,10 @@ void update_edge_src_property( if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_minor_property( - handle, graph_view, vertex_property_input_first, edge_src_property_output.mutable_view()); + handle, graph_view, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property( - handle, graph_view, vertex_property_input_first, edge_src_property_output.mutable_view()); + handle, graph_view, vertex_property_input_first, edge_src_property_output); } } @@ -903,6 +915,7 @@ void update_edge_src_property( * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexPropertyInputIterator Type of the iterator for vertex property values. 
+ * @tparam EdgeSrcValueOutputWrapper Type of the wrapper for output edge source property values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -914,21 +927,21 @@ void update_edge_src_property( * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p * graph_view.local_vertex_partition_range_size(). - * @param edge_partition_src_property_output edge_src_property_t class object to store source + * @param edge_partition_src_property_output edge_src_property_view_t class object to store source * property values (for the edge sources assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ -template -void update_edge_src_property( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexPropertyInputIterator vertex_property_input_first, - edge_src_property_t::value_type>& - edge_src_property_output, - bool do_expensive_check = false) +template +void update_edge_src_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexPropertyInputIterator vertex_property_input_first, + EdgeSrcValueOutputWrapper edge_src_property_output, + bool do_expensive_check = false) { if (do_expensive_check) { auto num_invalids = thrust::count_if( @@ -956,14 +969,14 @@ void update_edge_src_property( vertex_first, vertex_last, vertex_property_input_first, - edge_src_property_output.mutable_view()); + edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, vertex_first, vertex_last, vertex_property_input_first, - 
edge_src_property_output.mutable_view()); + edge_src_property_output); } } @@ -975,6 +988,8 @@ void update_edge_src_property( * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexPropertyInputIterator Type of the iterator for vertex property values. + * @tparam EdgeDstValueOutputWrapper Type of the wrapper for output edge destination property + * values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -982,19 +997,18 @@ void update_edge_src_property( * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p * graph_view.local_vertex_partition_range_size(). - * @param edge_partition_dst_property_output edge_dst_property_t class object to store destination - * property values (for the edge destinations assigned to this process in multi-GPU). + * @param edge_partition_dst_property_output edge_dst_property_view_t class object to store + * destination property values (for the edge destinations assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
*/ -template -void update_edge_dst_property( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexPropertyInputIterator vertex_property_input_first, - edge_dst_property_t::value_type>& - edge_dst_property_output, - bool do_expensive_check = false) +template +void update_edge_dst_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPropertyInputIterator vertex_property_input_first, + EdgeDstValueOutputWrapper edge_dst_property_output, + bool do_expensive_check = false) { if (do_expensive_check) { // currently, nothing to do @@ -1002,10 +1016,10 @@ void update_edge_dst_property( if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_major_property( - handle, graph_view, vertex_property_input_first, edge_dst_property_output.mutable_view()); + handle, graph_view, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property( - handle, graph_view, vertex_property_input_first, edge_dst_property_output.mutable_view()); + handle, graph_view, vertex_property_input_first, edge_dst_property_output); } } @@ -1018,6 +1032,8 @@ void update_edge_dst_property( * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexPropertyInputIterator Type of the iterator for vertex property values. + * @tparam EdgeDstValueOutputWrapper Type of the wrapper for output edge destination property + * values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -1029,21 +1045,21 @@ void update_edge_dst_property( * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p * graph_view.local_vertex_partition_range_size(). 
- * @param edge_partition_dst_property_output edge_dst_property_t class object to store destination - * property values (for the edge destinations assigned to this process in multi-GPU). + * @param edge_partition_dst_property_output edge_dst_property_view_t class object to store + * destination property values (for the edge destinations assigned to this process in multi-GPU). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). */ -template -void update_edge_dst_property( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexPropertyInputIterator vertex_property_input_first, - edge_dst_property_t::value_type>& - edge_dst_property_output, - bool do_expensive_check = false) +template +void update_edge_dst_property(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexPropertyInputIterator vertex_property_input_first, + EdgeDstValueOutputWrapper edge_dst_property_output, + bool do_expensive_check = false) { if (do_expensive_check) { auto num_invalids = thrust::count_if( @@ -1071,14 +1087,14 @@ void update_edge_dst_property( vertex_first, vertex_last, vertex_property_input_first, - edge_dst_property_output.mutable_view()); + edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, vertex_first, vertex_last, vertex_property_input_first, - edge_dst_property_output.mutable_view()); + edge_dst_property_output); } } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 08aadf9fa18..b13e6bfd458 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -43,6 +43,7 @@ #include #include #include +#include #include namespace cugraph { @@ -62,18 +63,52 @@ class key_bucket_t { static_assert(std::is_same_v || std::is_arithmetic_v); - using optional_buffer_type = std:: - 
conditional_t, std::byte /* dummy */, rmm::device_uvector>; + using optional_variant_type = + std::conditional_t, + std::byte /* dummy */, + std::variant, raft::device_span>>; template >* = nullptr> key_bucket_t(raft::handle_t const& handle) - : handle_ptr_(&handle), vertices_(0, handle.get_stream()), tags_(std::byte{0}) + : handle_ptr_(&handle), + vertices_(rmm::device_uvector(0, handle.get_stream())), + tags_(std::byte{0}) { } template >* = nullptr> key_bucket_t(raft::handle_t const& handle) - : handle_ptr_(&handle), vertices_(0, handle.get_stream()), tags_(0, handle.get_stream()) + : handle_ptr_(&handle), + vertices_(rmm::device_uvector(0, handle.get_stream())), + tags_(rmm::device_uvector(0, handle.get_stream())) + { + } + + template >* = nullptr> + key_bucket_t(raft::handle_t const& handle, rmm::device_uvector&& vertices) + : handle_ptr_(&handle), vertices_(std::move(vertices)), tags_(std::byte{0}) + { + } + + template >* = nullptr> + key_bucket_t(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& tags) + : handle_ptr_(&handle), vertices_(std::move(vertices)), tags_(std::move(tags)) + { + } + + template >* = nullptr> + key_bucket_t(raft::handle_t const& handle, raft::device_span vertices) + : handle_ptr_(&handle), vertices_(vertices), tags_(std::byte{0}) + { + } + + template >* = nullptr> + key_bucket_t(raft::handle_t const& handle, + raft::device_span vertices, + raft::device_span tags) + : handle_ptr_(&handle), vertices_(vertices), tags_(tags) { } @@ -85,12 +120,15 @@ class key_bucket_t { template >* = nullptr> void insert(vertex_t vertex) { - if (vertices_.size() > 0) { + CUGRAPH_EXPECTS(vertices_.index() == 0, + "insert() is supported only when this bucket holds an owning container."); + if (std::get<0>(vertices_).size() > 0) { rmm::device_scalar tmp(vertex, handle_ptr_->get_stream()); insert(tmp.data(), tmp.data() + 1); } else { - vertices_.resize(1, handle_ptr_->get_stream()); - raft::update_device(vertices_.data(), 
&vertex, size_t{1}, handle_ptr_->get_stream()); + std::get<0>(vertices_).resize(1, handle_ptr_->get_stream()); + raft::update_device( + std::get<0>(vertices_).data(), &vertex, size_t{1}, handle_ptr_->get_stream()); } } @@ -103,17 +141,19 @@ class key_bucket_t { template >* = nullptr> void insert(thrust::tuple key) { - if (vertices_.size() > 0) { + CUGRAPH_EXPECTS(vertices_.index() == 0, + "insert() is supported only when this bucket holds an owning container."); + if (std::get<0>(vertices_).size() > 0) { rmm::device_scalar tmp_vertex(thrust::get<0>(key), handle_ptr_->get_stream()); rmm::device_scalar tmp_tag(thrust::get<1>(key), handle_ptr_->get_stream()); auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_vertex.data(), tmp_tag.data())); insert(pair_first, pair_first + 1); } else { - vertices_.resize(1, handle_ptr_->get_stream()); - tags_.resize(1, handle_ptr_->get_stream()); - auto pair_first = - thrust::make_tuple(thrust::make_zip_iterator(vertices_.begin(), tags_.begin())); + std::get<0>(vertices_).resize(1, handle_ptr_->get_stream()); + std::get<0>(tags_).resize(1, handle_ptr_->get_stream()); + auto pair_first = thrust::make_tuple( + thrust::make_zip_iterator(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())); thrust::fill(handle_ptr_->get_thrust_policy(), pair_first, pair_first + 1, key); } } @@ -134,14 +174,16 @@ class key_bucket_t { static_assert( std::is_same_v::value_type, vertex_t>); - if (vertices_.size() > 0) { + CUGRAPH_EXPECTS(vertices_.index() == 0, + "insert() is supported only when this bucket holds an owning container."); + if (std::get<0>(vertices_).size() > 0) { if constexpr (sorted_unique) { rmm::device_uvector merged_vertices( - vertices_.size() + thrust::distance(vertex_first, vertex_last), + std::get<0>(vertices_).size() + thrust::distance(vertex_first, vertex_last), handle_ptr_->get_stream()); thrust::merge(handle_ptr_->get_thrust_policy(), - vertices_.begin(), - vertices_.end(), + std::get<0>(vertices_).begin(), 
+ std::get<0>(vertices_).end(), vertex_first, vertex_last, merged_vertices.begin()); @@ -150,19 +192,23 @@ class key_bucket_t { merged_vertices.begin(), merged_vertices.end())), handle_ptr_->get_stream()); - vertices_ = std::move(merged_vertices); + std::get<0>(vertices_) = std::move(merged_vertices); } else { - auto cur_size = vertices_.size(); - vertices_.resize(cur_size + thrust::distance(vertex_first, vertex_last), - handle_ptr_->get_stream()); + auto cur_size = std::get<0>(vertices_).size(); + std::get<0>(vertices_).resize(cur_size + thrust::distance(vertex_first, vertex_last), + handle_ptr_->get_stream()); thrust::copy(handle_ptr_->get_thrust_policy(), vertex_first, vertex_last, - vertices_.begin() + cur_size); + std::get<0>(vertices_).begin() + cur_size); } } else { - vertices_.resize(thrust::distance(vertex_first, vertex_last), handle_ptr_->get_stream()); - thrust::copy(handle_ptr_->get_thrust_policy(), vertex_first, vertex_last, vertices_.begin()); + std::get<0>(vertices_).resize(thrust::distance(vertex_first, vertex_last), + handle_ptr_->get_stream()); + thrust::copy(handle_ptr_->get_thrust_policy(), + vertex_first, + vertex_last, + std::get<0>(vertices_).begin()); } } @@ -182,18 +228,21 @@ class key_bucket_t { static_assert(std::is_same_v::value_type, thrust::tuple>); - if (vertices_.size() > 0) { + CUGRAPH_EXPECTS(vertices_.index() == 0, + "insert() is supported only when this bucket holds an owning container."); + if (std::get<0>(vertices_).size() > 0) { if constexpr (sorted_unique) { rmm::device_uvector merged_vertices( - vertices_.size() + thrust::distance(key_first, key_last), handle_ptr_->get_stream()); + std::get<0>(vertices_).size() + thrust::distance(key_first, key_last), + handle_ptr_->get_stream()); rmm::device_uvector merged_tags(merged_vertices.size(), handle_ptr_->get_stream()); - auto old_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + auto old_pair_first = thrust::make_zip_iterator( + 
thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())); auto merged_pair_first = thrust::make_zip_iterator( thrust::make_tuple(merged_vertices.begin(), merged_tags.begin())); thrust::merge(handle_ptr_->get_thrust_policy(), old_pair_first, - old_pair_first + vertices_.size(), + old_pair_first + std::get<0>(vertices_).size(), key_first, key_last, merged_pair_first); @@ -204,95 +253,154 @@ class key_bucket_t { merged_pair_first + merged_vertices.size())), handle_ptr_->get_stream()); merged_tags.resize(merged_vertices.size(), handle_ptr_->get_stream()); - vertices_ = std::move(merged_vertices); - tags_ = std::move(merged_tags); + std::get<0>(vertices_) = std::move(merged_vertices); + std::get<0>(tags_) = std::move(merged_tags); } else { - auto cur_size = vertices_.size(); - vertices_.resize(cur_size + thrust::distance(key_first, key_last), - handle_ptr_->get_stream()); - tags_.resize(vertices_.size(), handle_ptr_->get_stream()); - thrust::copy( - handle_ptr_->get_thrust_policy(), - key_first, - key_last, - thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())) + - cur_size); + auto cur_size = std::get<0>(vertices_).size(); + std::get<0>(vertices_).resize(cur_size + thrust::distance(key_first, key_last), + handle_ptr_->get_stream()); + std::get<0>(tags_).resize(std::get<0>(vertices_).size(), handle_ptr_->get_stream()); + thrust::copy(handle_ptr_->get_thrust_policy(), + key_first, + key_last, + thrust::make_zip_iterator(thrust::make_tuple(std::get<0>(vertices_).begin(), + std::get<0>(tags_).begin())) + + cur_size); } } else { - vertices_.resize(thrust::distance(key_first, key_last), handle_ptr_->get_stream()); - tags_.resize(thrust::distance(key_first, key_last), handle_ptr_->get_stream()); + std::get<0>(vertices_).resize(thrust::distance(key_first, key_last), + handle_ptr_->get_stream()); + std::get<0>(tags_).resize(thrust::distance(key_first, key_last), handle_ptr_->get_stream()); 
thrust::copy(handle_ptr_->get_thrust_policy(), key_first, key_last, - thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin()))); + thrust::make_zip_iterator(thrust::make_tuple(std::get<0>(vertices_).begin(), + std::get<0>(tags_).begin()))); } } - size_t size() const { return vertices_.size(); } + size_t size() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size(); + } template std::enable_if_t aggregate_size() const { - return host_scalar_allreduce(handle_ptr_->get_comms(), - vertices_.size(), - raft::comms::op_t::SUM, - handle_ptr_->get_stream()); + return host_scalar_allreduce( + handle_ptr_->get_comms(), + vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size(), + raft::comms::op_t::SUM, + handle_ptr_->get_stream()); } template std::enable_if_t aggregate_size() const { - return vertices_.size(); + return vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size(); } void resize(size_t size) { - vertices_.resize(size, handle_ptr_->get_stream()); - if constexpr (!std::is_same_v) { tags_.resize(size, handle_ptr_->get_stream()); } + CUGRAPH_EXPECTS(vertices_.index() == 0, + "resize() is supported only when this bucket holds an owning container."); + std::get<0>(vertices_).resize(size, handle_ptr_->get_stream()); + if constexpr (!std::is_same_v) { + std::get<0>(tags_).resize(size, handle_ptr_->get_stream()); + } } - void clear() { resize(0); } + void clear() + { + CUGRAPH_EXPECTS(vertices_.index() == 0, + "clear() is supported only when this bucket holds an owning container."); + resize(0); + } void shrink_to_fit() { - vertices_.shrink_to_fit(handle_ptr_->get_stream()); - if constexpr (!std::is_same_v) { tags_.shrink_to_fit(handle_ptr_->get_stream()); } + CUGRAPH_EXPECTS( + vertices_.index() == 0, + "shrink_to_fit() is supported only when this bucket holds an owning container."); + 
std::get<0>(vertices_).shrink_to_fit(handle_ptr_->get_stream()); + if constexpr (!std::is_same_v) { + std::get<0>(tags_).shrink_to_fit(handle_ptr_->get_stream()); + } } auto const begin() const { if constexpr (std::is_same_v) { - return vertices_.begin(); + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); } else { - return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); } } auto begin() { + CUGRAPH_EXPECTS( + vertices_.index() == 0, + "non-const begin() is supported only when this bucket holds an owning container."); if constexpr (std::is_same_v) { - return vertices_.begin(); + return std::get<0>(vertices_).begin(); } else { - return thrust::make_zip_iterator(thrust::make_tuple(vertices_.begin(), tags_.begin())); + return thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())); } } - auto const end() const { return begin() + vertices_.size(); } + auto const end() const + { + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + } + + auto end() + { + CUGRAPH_EXPECTS( + vertices_.index() == 0, + "non-const end() is supported only when this bucket holds an owning container."); + return begin() + std::get<0>(vertices_).size(); + } - auto end() { return begin() + vertices_.size(); } + auto const vertex_begin() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } - auto const vertex_begin() const { return vertices_.begin(); } + auto const vertex_end() const + { + return vertices_.index() == 0 ? 
std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } - auto const vertex_end() const { return vertices_.end(); } + auto vertex_begin() + { + CUGRAPH_EXPECTS( + vertices_.index() == 0, + "non-const vertex_begin() is supported only when this bucket holds an owning container."); + return std::get<0>(vertices_).begin(); + } - auto vertex_begin() { return vertices_.begin(); } + auto vertex_end() + { + CUGRAPH_EXPECTS( + vertices_.index() == 0, + "non-const vertex_end() is supported only when this bucket holds an owning container."); + return std::get<0>(vertices_).end(); + } - auto vertex_end() { return vertices_.end(); } + bool is_owning() { return (vertices_.index() == 0); } private: raft::handle_t const* handle_ptr_{nullptr}; - rmm::device_uvector vertices_; - optional_buffer_type tags_; + std::variant, raft::device_span> vertices_; + optional_variant_type tags_; }; template (lhs) < thrust::get<0>(rhs); }); rmm::device_uvector d_indices(to_bucket_indices.size(), handle_ptr_->get_stream()); rmm::device_uvector d_counts(d_indices.size(), handle_ptr_->get_stream()); + // FIXME: thrust::lower_bound & thrust::upper_bound will be faster auto it = thrust::reduce_by_key(handle_ptr_->get_thrust_policy(), bucket_idx_first, bucket_idx_last, diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 0689dc4a53a..fb1dee1a92f 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -291,9 +291,9 @@ coarsen_graph(raft::handle_t const& handle, edge_dst_property_t, vertex_t>> edge_minor_labels(handle, graph_view); if constexpr (store_transposed) { - update_edge_src_property(handle, graph_view, labels, edge_minor_labels); + update_edge_src_property(handle, graph_view, labels, edge_minor_labels.mutable_view()); } else { - update_edge_dst_property(handle, graph_view, labels, edge_minor_labels); + update_edge_dst_property(handle, graph_view, labels, edge_minor_labels.mutable_view()); } 
std::vector> coarsened_edgelist_majors{}; diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 7097349dce5..5371d53bcf0 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -94,7 +94,8 @@ rmm::device_uvector compute_major_degrees( auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); - rmm::device_uvector local_degrees(0, handle.get_stream()); + rmm::device_uvector local_degrees( + 0, handle.get_stream()); // excluding globally 0 degree vertices rmm::device_uvector degrees(0, handle.get_stream()); vertex_t max_num_local_degrees{0}; @@ -104,9 +105,16 @@ rmm::device_uvector compute_major_degrees( major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); auto major_range_vertex_partition_size = partition.vertex_partition_range_size(major_range_vertex_partition_id); - max_num_local_degrees = std::max(max_num_local_degrees, major_range_vertex_partition_size); + auto segment_offset_size_per_partition = + edge_partition_segment_offsets.size() / static_cast(minor_comm_size); + auto num_local_degrees = + edge_partition_segment_offsets[segment_offset_size_per_partition * i + + (segment_offset_size_per_partition - 2)]; + max_num_local_degrees = std::max(max_num_local_degrees, num_local_degrees); if (i == minor_comm_rank) { degrees.resize(major_range_vertex_partition_size, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), degrees.begin() + num_local_degrees, degrees.end(), edge_t{0}); } } local_degrees.resize(max_num_local_degrees, handle.get_stream()); @@ -114,20 +122,22 @@ rmm::device_uvector compute_major_degrees( auto major_range_vertex_partition_id = detail::compute_local_edge_partition_major_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - vertex_t major_range_first{}; - vertex_t major_range_last{}; - std::tie(major_range_first, major_range_last) = - 
partition.vertex_partition_range(major_range_vertex_partition_id); + auto major_range_first = + partition.vertex_partition_range_first(major_range_vertex_partition_id); + auto offsets = edge_partition_offsets[i]; auto masks = edge_partition_masks ? thrust::make_optional((*edge_partition_masks)[i]) : thrust::nullopt; auto segment_offset_size_per_partition = edge_partition_segment_offsets.size() / static_cast(minor_comm_size); + auto num_local_degrees = + edge_partition_segment_offsets[segment_offset_size_per_partition * i + + (segment_offset_size_per_partition - 2)]; auto major_hypersparse_first = use_dcs ? major_range_first + edge_partition_segment_offsets[segment_offset_size_per_partition * i + detail::num_sparse_segments_per_vertex_partition] - : major_range_last; + : major_range_first + num_local_degrees; auto execution_policy = handle.get_thrust_policy(); thrust::transform(execution_policy, thrust::make_counting_iterator(vertex_t{0}), @@ -145,7 +155,7 @@ rmm::device_uvector compute_major_degrees( auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i]; thrust::fill(execution_policy, local_degrees.begin() + (major_hypersparse_first - major_range_first), - local_degrees.begin() + (major_range_last - major_range_first), + local_degrees.begin() + num_local_degrees, edge_t{0}); thrust::for_each( execution_policy, @@ -169,7 +179,7 @@ rmm::device_uvector compute_major_degrees( } minor_comm.reduce(local_degrees.data(), i == minor_comm_rank ? 
degrees.data() : static_cast(nullptr), - static_cast(major_range_last - major_range_first), + static_cast(num_local_degrees), raft::comms::op_t::SUM, i, handle.get_stream()); diff --git a/cpp/src/structure/induced_subgraph_impl.cuh b/cpp/src/structure/induced_subgraph_impl.cuh index b1ce8e6f51e..a8cd2f6f50c 100644 --- a/cpp/src/structure/induced_subgraph_impl.cuh +++ b/cpp/src/structure/induced_subgraph_impl.cuh @@ -212,10 +212,9 @@ extract_induced_subgraphs( dst_subgraph_vertices = raft::device_span(dst_subgraph_vertices_v.data(), dst_subgraph_vertices_v.size()); - // 3. Call extract_transform_v_frontier_e with a functor that - // returns thrust::nullopt if the destination vertex has - // a property of 0, return the edge if the destination - // vertex has a property of 1 + // 3. Call extract_transform_v_frontier_outgoing_e with a functor that returns thrust::nullopt if + // the destination vertex has a property of 0, return the edge if the destination vertex has a + // property of 1 vertex_frontier_t vertex_frontier(handle, 1); graph_ids_v = detail::expand_sparse_offsets(subgraph_offsets, size_t{0}, handle.get_stream()); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index fb837484a14..8a18dedd2ab 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -17,8 +17,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" -#include "prims/update_edge_src_dst_property.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -38,8 +37,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -51,33 +52,35 @@ namespace cugraph { namespace { template -struct e_op_t { +struct topdown_e_op_t { + detail::edge_partition_endpoint_property_device_view_t + prev_visited_flags{}; // visited in the 
previous iterations, to reduce the number of atomic + // operations detail::edge_partition_endpoint_property_device_view_t visited_flags{}; - uint32_t const* prev_visited_flags{ - nullptr}; // relevant only if multi_gpu is false (this affects only local-computing with 0 - // impact in communication volume, so this may improve performance in small-scale but - // will eat-up more memory with no benefit in performance in large-scale). - vertex_t dst_first{}; // relevant only if multi_gpu is true + vertex_t dst_first{}; __device__ thrust::optional operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - bool push{}; - if constexpr (multi_gpu) { - auto dst_offset = dst - dst_first; - auto old = visited_flags.atomic_or(dst_offset, true); - push = !old; - } else { - auto mask = uint32_t{1} << (dst % (sizeof(uint32_t) * 8)); - if (*(prev_visited_flags + packed_bool_offset(dst)) & - packed_bool_mask(dst)) { // check if unvisited in previous iterations - push = false; - } else { // check if unvisited in this iteration as well - auto old = visited_flags.atomic_or(dst, true); - push = !old; - } - } - return push ? thrust::optional{src} : thrust::nullopt; + auto dst_offset = dst - dst_first; + auto old = prev_visited_flags.get(dst_offset); + if (!old) { old = visited_flags.atomic_or(dst_offset, true); } + return old ? thrust::nullopt : thrust::optional{src}; + } +}; + +template +struct bottomup_e_op_t { + detail::edge_partition_endpoint_property_device_view_t + prev_visited_flags{}; // visited in the previous iterations + vertex_t dst_first{}; + + __device__ thrust::optional operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + auto dst_offset = dst - dst_first; + auto old = prev_visited_flags.get(dst_offset); + return old ? 
thrust::optional{dst} : thrust::nullopt; } }; @@ -87,7 +90,7 @@ namespace detail { template void bfs(raft::handle_t const& handle, - GraphViewType const& push_graph_view, + GraphViewType const& graph_view, typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, typename GraphViewType::vertex_type const* sources, @@ -97,35 +100,58 @@ void bfs(raft::handle_t const& handle, bool do_expensive_check) { using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; static_assert(std::is_integral::value, "GraphViewType::vertex_type should be integral."); static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); - auto const num_vertices = push_graph_view.number_of_vertices(); + // direction optimizing BFS implementation is based on "S. Beamer, K. Asanovic, D. Patterson, + // Direction-Optimizing Breadth-First Search, 2012" + + auto const num_vertices = graph_view.number_of_vertices(); if (num_vertices == 0) { return; } // 1. check input arguments CUGRAPH_EXPECTS((n_sources == 0) || (sources != nullptr), - "Invalid input argument: sources cannot be null"); - - auto aggregate_n_sources = - GraphViewType::is_multi_gpu - ? 
host_scalar_allreduce( - handle.get_comms(), n_sources, raft::comms::op_t::SUM, handle.get_stream()) - : n_sources; - CUGRAPH_EXPECTS(aggregate_n_sources > 0, - "Invalid input argument: input should have at least one source"); + "Invalid input argument: sources cannot be null if n_sources > 0."); + + if (GraphViewType::is_multi_gpu) { + if (do_expensive_check) { + auto aggregate_n_sources = host_scalar_allreduce(handle.get_comms(), + static_cast(n_sources), + raft::comms::op_t::SUM, + handle.get_stream()); + CUGRAPH_EXPECTS(aggregate_n_sources > 0, + "Invalid input argument: input should have at least one source"); + } + } else { + CUGRAPH_EXPECTS(n_sources > 0, + "Invalid input argument: input should have at least one source."); + } CUGRAPH_EXPECTS( - push_graph_view.is_symmetric() || !direction_optimizing, + graph_view.is_symmetric() || !direction_optimizing, "Invalid input argument: input graph should be symmetric for direction optimizing BFS."); + auto vertex_partition = vertex_partition_device_view_t( + graph_view.local_vertex_partition_view()); + if (do_expensive_check) { - auto vertex_partition = vertex_partition_device_view_t( - push_graph_view.local_vertex_partition_view()); + bool is_sorted = thrust::is_sorted(handle.get_thrust_policy(), sources, sources + n_sources); + if constexpr (GraphViewType::is_multi_gpu) { + is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(), + static_cast(is_sorted), + raft::comms::op_t::SUM, + handle.get_stream())); + } + + CUGRAPH_EXPECTS( + is_sorted, + "Invalid input arguments: input sources should be sorted in the non-descending order."); + auto num_invalid_vertices = thrust::count_if(handle.get_thrust_policy(), sources, @@ -149,26 +175,51 @@ void bfs(raft::handle_t const& handle, thrust::fill(handle.get_thrust_policy(), distances, - distances + push_graph_view.local_vertex_partition_range_size(), + distances + graph_view.local_vertex_partition_range_size(), invalid_distance); 
thrust::fill(handle.get_thrust_policy(), predecessor_first, - predecessor_first + push_graph_view.local_vertex_partition_range_size(), + predecessor_first + graph_view.local_vertex_partition_range_size(), invalid_vertex); - auto vertex_partition = vertex_partition_device_view_t( - push_graph_view.local_vertex_partition_view()); - if (n_sources) { - thrust::for_each( - handle.get_thrust_policy(), - sources, - sources + n_sources, - [vertex_partition, distances, predecessor_first] __device__(auto v) { - *(distances + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)) = - vertex_t{0}; - }); + auto output_first = thrust::make_permutation_iterator( + distances, + thrust::make_transform_iterator( + sources, detail::shift_left_t{graph_view.local_vertex_partition_range_first()})); + thrust::fill(handle.get_thrust_policy(), output_first, output_first + n_sources, vertex_t{0}); + + // 3. update meta data for direction optimizing BFS + + constexpr edge_t direction_optimizing_alpha = 14; + constexpr vertex_t direction_optimizing_beta = 24; + + std::optional> out_degrees{std::nullopt}; + std::optional> nzd_unvisited_vertices{std::nullopt}; + if (direction_optimizing) { + out_degrees = graph_view.compute_out_degrees(handle); + nzd_unvisited_vertices = rmm::device_uvector( + graph_view.local_vertex_partition_range_size(), handle.get_stream()); + (*nzd_unvisited_vertices) + .resize(thrust::distance( + (*nzd_unvisited_vertices).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), + (*nzd_unvisited_vertices).begin(), + [vertex_partition, + sources = raft::device_span(sources, n_sources), + out_degrees = raft::device_span( + (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { + auto v_offset = + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return 
(out_degrees[v_offset] > edge_t{0}) && + !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); + })), + handle.get_stream()); + (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); } - // 3. initialize BFS frontier + // 4. initialize BFS frontier constexpr size_t bucket_idx_cur = 0; constexpr size_t bucket_idx_next = 1; @@ -176,110 +227,235 @@ void bfs(raft::handle_t const& handle, vertex_frontier_t vertex_frontier(handle, num_buckets); - vertex_frontier.bucket(bucket_idx_cur).insert(sources, sources + n_sources); - rmm::device_uvector visited_flags( - packed_bool_size(push_graph_view.local_vertex_partition_range_size()), handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - visited_flags.begin(), - visited_flags.end(), - packed_bool_empty_mask()); - rmm::device_uvector prev_visited_flags( - GraphViewType::is_multi_gpu ? size_t{0} : visited_flags.size(), - handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is false - auto dst_visited_flags = - GraphViewType::is_multi_gpu - ? edge_dst_property_t(handle, push_graph_view) - : edge_dst_property_t(handle); // relevant only if GraphViewType::is_multi_gpu is true - if constexpr (GraphViewType::is_multi_gpu) { - fill_edge_dst_property(handle, push_graph_view, false, dst_visited_flags); - } + + // 5. 
initialize BFS temporary state data + + auto prev_dst_visited_flags = edge_dst_property_t(handle, graph_view); + fill_edge_dst_property(handle, graph_view, prev_dst_visited_flags.mutable_view(), false); + auto dst_visited_flags = edge_dst_property_t( + handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited + // (but this is OK as we check prev_dst_visited_flags first) + fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); + + fill_edge_dst_property(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur).begin(), + vertex_frontier.bucket(bucket_idx_cur).end(), + prev_dst_visited_flags.mutable_view(), + true); // 4. BFS iteration vertex_t depth{0}; + bool top_down = true; + auto cur_aggregate_vertex_frontier_size = + static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - if (direction_optimizing) { - CUGRAPH_FAIL("unimplemented."); - } else { - if (GraphViewType::is_multi_gpu) { - update_edge_dst_property(handle, - push_graph_view, - vertex_frontier.bucket(bucket_idx_cur).begin(), - vertex_frontier.bucket(bucket_idx_cur).end(), - thrust::make_constant_iterator(true), - dst_visited_flags); - } else { - thrust::copy(handle.get_thrust_policy(), - visited_flags.begin(), - visited_flags.end(), - prev_visited_flags.begin()); - } + vertex_t next_aggregate_vertex_frontier_size{}; + if (top_down) { + topdown_e_op_t e_op{}; + e_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.mutable_view()); + e_op.visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + dst_visited_flags.mutable_view()); + e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + auto [new_frontier_vertex_buffer, predecessor_buffer] = + transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + 
edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); - e_op_t e_op{}; - if constexpr (GraphViewType::is_multi_gpu) { - e_op.visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - dst_visited_flags.mutable_view()); - e_op.dst_first = push_graph_view.local_edge_partition_dst_range_first(); - } else { - e_op.visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(visited_flags.data(), - vertex_t{0})); - e_op.prev_visited_flags = prev_visited_flags.data(); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + thrust::scatter( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + new_frontier_vertex_buffer.size(), + thrust::make_transform_iterator( + new_frontier_vertex_buffer.begin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + thrust::make_zip_iterator(distances, predecessor_first)); + vertex_frontier.bucket(bucket_idx_next) = + key_bucket_t( + handle, std::move(new_frontier_vertex_buffer)); + + next_aggregate_vertex_frontier_size = + static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); + if (next_aggregate_vertex_frontier_size == 0) { break; } + + fill_edge_dst_property(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + prev_dst_visited_flags.mutable_view(), + true); + + if (direction_optimizing) { + auto m_f = thrust::transform_reduce( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + cuda::proclaim_return_type( + [vertex_partition, + out_degrees = raft::device_span( + (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { + auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return 
out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{}); + + { + rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), + handle.get_stream()); + tmp_vertices.resize( + thrust::distance(tmp_vertices.begin(), + thrust::set_difference(handle.get_thrust_policy(), + (*nzd_unvisited_vertices).begin(), + (*nzd_unvisited_vertices).end(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + tmp_vertices.begin())), + handle.get_stream()); + nzd_unvisited_vertices = std::move(tmp_vertices); + } + + auto m_u = thrust::transform_reduce( + handle.get_thrust_policy(), + (*nzd_unvisited_vertices).begin(), + (*nzd_unvisited_vertices).end(), + cuda::proclaim_return_type( + [vertex_partition, + out_degrees = raft::device_span( + (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { + auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{}); + auto aggregate_m_f = + GraphViewType::is_multi_gpu + ? host_scalar_allreduce( + handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) + : m_f; + auto aggregate_m_u = + GraphViewType::is_multi_gpu + ? 
host_scalar_allreduce( + handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) + : m_u; + if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && + (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { + top_down = false; + } } + if (top_down) { // staying in top-down + vertex_frontier.bucket(bucket_idx_cur) = + key_bucket_t(handle); + vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); + } else { // switching to bottom-up + vertex_frontier.bucket(bucket_idx_cur) = + key_bucket_t( + handle, + raft::device_span((*nzd_unvisited_vertices).data(), + (*nzd_unvisited_vertices).size())); + vertex_frontier.bucket(bucket_idx_next) = + key_bucket_t(handle); + } + } else { // bottom up + bottomup_e_op_t e_op{}; + e_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.mutable_view()); + e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, + transform_reduce_v_frontier_outgoing_e_by_src(handle, + graph_view, vertex_frontier.bucket(bucket_idx_cur), edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), -#if 1 e_op, -#else - // FIXME: need to test more about the performance trade-offs between additional - // communication in updating dst_visited_flags (+ using atomics) vs reduced number of - // pushes (leading to both less computation & communication in reduction) - [vertex_partition, distances] __device__( - vertex_t src, vertex_t dst, auto, auto, auto) { - auto push = true; - if (vertex_partition.in_local_vertex_partition_range_nocheck(dst)) { - auto distance = - *(distances + - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(dst)); - if (distance != invalid_distance) { push = false; } - } - return thrust::make_tuple(push, src); - }, -#endif 
reduce_op::any()); - update_v_frontier( - handle, - push_graph_view, - std::move(new_frontier_vertex_buffer), - std::move(predecessor_buffer), - vertex_frontier, - std::vector{bucket_idx_next}, - distances, - thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), - [depth] __device__(auto v, auto v_val, auto pushed_val) { - auto update = (v_val == invalid_distance); - return thrust::make_tuple( - update ? thrust::optional{bucket_idx_next} : thrust::nullopt, - update ? thrust::optional>{thrust::make_tuple( - depth + 1, pushed_val)} - : thrust::nullopt); - }); - - vertex_frontier.bucket(bucket_idx_cur).clear(); - vertex_frontier.bucket(bucket_idx_cur).shrink_to_fit(); - vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); - if (vertex_frontier.bucket(bucket_idx_cur).aggregate_size() == 0) { break; } + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + thrust::scatter( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + new_frontier_vertex_buffer.size(), + thrust::make_transform_iterator( + new_frontier_vertex_buffer.begin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + thrust::make_zip_iterator(distances, predecessor_first)); + + assert(direction_optimizing); + + { + rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), + handle.get_stream()); + tmp_vertices.resize( + thrust::distance(tmp_vertices.begin(), + thrust::set_difference(handle.get_thrust_policy(), + (*nzd_unvisited_vertices).begin(), + (*nzd_unvisited_vertices).end(), + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + tmp_vertices.begin())), + handle.get_stream()); + nzd_unvisited_vertices = std::move(tmp_vertices); + } + + next_aggregate_vertex_frontier_size = + GraphViewType::is_multi_gpu + ? 
host_scalar_allreduce(handle.get_comms(), + static_cast(new_frontier_vertex_buffer.size()), + raft::comms::op_t::SUM, + handle.get_stream()) + : static_cast(new_frontier_vertex_buffer.size()); + if (next_aggregate_vertex_frontier_size == 0) { break; } + + fill_edge_dst_property(handle, + graph_view, + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + prev_dst_visited_flags.mutable_view(), + true); + + auto aggregate_nzd_unvisted_vertices = + GraphViewType::is_multi_gpu + ? host_scalar_allreduce(handle.get_comms(), + static_cast((*nzd_unvisited_vertices).size()), + raft::comms::op_t::SUM, + handle.get_stream()) + : static_cast((*nzd_unvisited_vertices).size()); + + if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisted_vertices) && + (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { + top_down = true; + } + + if (top_down) { // switching to top-down + vertex_frontier.bucket(bucket_idx_cur) = + key_bucket_t( + handle, std::move(new_frontier_vertex_buffer)); + } else { // staying in bottom-up + vertex_frontier.bucket(bucket_idx_cur) = + key_bucket_t( + handle, + raft::device_span((*nzd_unvisited_vertices).data(), + (*nzd_unvisited_vertices).size())); + } } + cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; depth++; if (depth >= depth_limit) { break; } diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index c74bb9cd58c..acf3cfe8fc5 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/vertex_frontier.cuh" #include diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index d1ff860f004..e1b7444b92f
100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -16,12 +16,13 @@ #pragma once #include "prims/count_if_e.cuh" +#include "prims/detail/extract_transform_v_frontier_e.cuh" #include "prims/fill_edge_src_dst_property.cuh" #include "prims/key_store.cuh" #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -640,6 +641,7 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< + false, thrust::tuple, weight_t, vertex_t, @@ -651,8 +653,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = - detail::extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: + extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 4544fb1e8ce..47908524feb 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -172,8 +172,10 @@ void sssp(raft::handle_t const& handle, ? 
edge_src_property_t(handle, push_graph_view) : edge_src_property_t(handle); if (GraphViewType::is_multi_gpu) { - fill_edge_src_property( - handle, push_graph_view, std::numeric_limits::max(), edge_src_distances); + fill_edge_src_property(handle, + push_graph_view, + edge_src_distances.mutable_view(), + std::numeric_limits::max()); } if (push_graph_view.in_local_vertex_partition_range_nocheck(source_vertex)) { @@ -188,7 +190,7 @@ void sssp(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur_near).begin(), vertex_frontier.bucket(bucket_idx_cur_near).end(), distances, - edge_src_distances); + edge_src_distances.mutable_view()); } auto vertex_partition = vertex_partition_device_view_t( diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 103953dd5d2..73a3104f27b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -661,9 +661,9 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu) ############################################################################################### - # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------ - ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) + # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- + ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST + prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/components/mg_mis_test.cu b/cpp/tests/components/mg_mis_test.cu index c2469ba867d..9c50be3fa28 100644 --- a/cpp/tests/components/mg_mis_test.cu +++ b/cpp/tests/components/mg_mis_test.cu @@ -145,9 +145,9 @@ class 
Tests_MGMaximalIndependentSet dst_inclusion_cache = cugraph::edge_dst_property_t(*handle_, mg_graph_view); update_edge_src_property( - *handle_, mg_graph_view, inclusiong_flags.begin(), src_inclusion_cache); + *handle_, mg_graph_view, inclusiong_flags.begin(), src_inclusion_cache.mutable_view()); update_edge_dst_property( - *handle_, mg_graph_view, inclusiong_flags.begin(), dst_inclusion_cache); + *handle_, mg_graph_view, inclusiong_flags.begin(), dst_inclusion_cache.mutable_view()); } per_v_transform_reduce_outgoing_e( diff --git a/cpp/tests/components/mg_vertex_coloring_test.cu b/cpp/tests/components/mg_vertex_coloring_test.cu index 2c54e3cf065..14e15df502f 100644 --- a/cpp/tests/components/mg_vertex_coloring_test.cu +++ b/cpp/tests/components/mg_vertex_coloring_test.cu @@ -111,8 +111,10 @@ class Tests_MGGraphColoring cugraph::edge_src_property_t(*handle_, mg_graph_view); dst_color_cache = cugraph::edge_dst_property_t(*handle_, mg_graph_view); - update_edge_src_property(*handle_, mg_graph_view, d_colors.begin(), src_color_cache); - update_edge_dst_property(*handle_, mg_graph_view, d_colors.begin(), dst_color_cache); + update_edge_src_property( + *handle_, mg_graph_view, d_colors.begin(), src_color_cache.mutable_view()); + update_edge_dst_property( + *handle_, mg_graph_view, d_colors.begin(), dst_color_cache.mutable_view()); } rmm::device_uvector d_color_conflicts( diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu index 137f7db8625..8ad34a5a724 100644 --- a/cpp/tests/prims/mg_count_if_e.cu +++ b/cpp/tests/prims/mg_count_if_e.cu @@ -15,7 +15,6 @@ */ #include "prims/count_if_e.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" #include "utilities/device_comm_wrapper.hpp" diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu index 75b711fbd9c..681a7d8e6ff 100644 
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu @@ -16,7 +16,6 @@ #include "prims/per_v_pair_transform_dst_nbr_intersection.cuh" #include "prims/transform_e.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" #include "utilities/device_comm_wrapper.hpp" diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu index fd9192dcce5..bef6395a780 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu @@ -16,7 +16,6 @@ #include "prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "result_compare.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index be29c793ad5..d4f102127c5 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -16,7 +16,6 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "result_compare.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu index 2b8d5d52905..2e8b2feaef3 100644 --- a/cpp/tests/prims/mg_transform_e.cu +++ b/cpp/tests/prims/mg_transform_e.cu @@ -160,7 +160,8 @@ class Tests_MGTransformE cugraph::edge_property_t edge_value_output(*handle_, 
mg_graph_view); - cugraph::fill_edge_property(*handle_, mg_graph_view, property_initial_value, edge_value_output); + cugraph::fill_edge_property( + *handle_, mg_graph_view, edge_value_output.mutable_view(), property_initial_value); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement diff --git a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu index 4fac6ef3be7..8eee3d5a6d5 100644 --- a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu +++ b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu @@ -16,7 +16,6 @@ #include "prims/transform_e.cuh" #include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" #include "utilities/device_comm_wrapper.hpp" diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu index 4785a8bb01b..9f9e71f6e55 100644 --- a/cpp/tests/prims/mg_transform_reduce_e.cu +++ b/cpp/tests/prims/mg_transform_reduce_e.cu @@ -15,7 +15,6 @@ */ #include "prims/transform_reduce_e.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "result_compare.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu index 9950b5bdbf4..67cacb27e0c 100644 --- a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu +++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu @@ -16,7 +16,6 @@ #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e_by_src_dst_key.cuh" -#include "prims/update_edge_src_dst_property.cuh" #include "result_compare.cuh" #include 
"utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu similarity index 55% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu index 335a7ec879c..596bb3688fb 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu @@ -14,8 +14,7 @@ * limitations under the License. */ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" -#include "prims/update_edge_src_dst_property.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -93,10 +92,10 @@ struct Prims_Usecase { }; template -class Tests_MGTransformReduceVFrontierOutgoingEByDst +class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst : public ::testing::TestWithParam> { public: - Tests_MGTransformReduceVFrontierOutgoingEByDst() {} + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst() {} static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } @@ -204,26 +203,68 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); } - auto mg_new_frontier_key_buffer = + auto mg_reduce_by_src_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_payload_buffer = + [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = 
cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { - mg_new_frontier_key_buffer = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); + mg_reduce_by_src_new_frontier_key_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_src( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + cugraph::reduce_op::null{}); + } else { + std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = + cugraph::transform_reduce_v_frontier_outgoing_e_by_src( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + cugraph::reduce_op::plus{}); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + } + + auto mg_reduce_by_dst_new_frontier_key_buffer = + cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); + [[maybe_unused]] auto mg_reduce_by_dst_payload_buffer = + cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); + + if constexpr (std::is_same_v) { + mg_reduce_by_dst_new_frontier_key_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + 
mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + cugraph::reduce_op::null{}); } else { - std::tie(mg_new_frontier_key_buffer, mg_payload_buffer) = + std::tie(mg_reduce_by_dst_new_frontier_key_buffer, mg_reduce_by_dst_payload_buffer) = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( *handle_, mg_graph_view, @@ -248,46 +289,110 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst if constexpr (std::is_same_v) { cugraph::unrenumber_int_vertices( *handle_, - mg_new_frontier_key_buffer.begin(), - mg_new_frontier_key_buffer.size(), + mg_reduce_by_src_new_frontier_key_buffer.begin(), + mg_reduce_by_src_new_frontier_key_buffer.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + cugraph::unrenumber_int_vertices( + *handle_, + mg_reduce_by_dst_new_frontier_key_buffer.begin(), + mg_reduce_by_dst_new_frontier_key_buffer.size(), (*mg_renumber_map).data(), mg_graph_view.vertex_partition_range_lasts()); } else { cugraph::unrenumber_int_vertices( *handle_, - std::get<0>(mg_new_frontier_key_buffer).begin(), - std::get<0>(mg_new_frontier_key_buffer).size(), + std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), + std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + cugraph::unrenumber_int_vertices( + *handle_, + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), (*mg_renumber_map).data(), mg_graph_view.vertex_partition_range_lasts()); } - auto mg_aggregate_new_frontier_key_buffer = + auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { - mg_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv( - *handle_, mg_new_frontier_key_buffer.data(), mg_new_frontier_key_buffer.size()); - } else { - 
std::get<0>(mg_aggregate_new_frontier_key_buffer) = + mg_reduce_by_src_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_new_frontier_key_buffer).data(), - std::get<0>(mg_new_frontier_key_buffer).size()); - std::get<1>(mg_aggregate_new_frontier_key_buffer) = + mg_reduce_by_src_new_frontier_key_buffer.data(), + mg_reduce_by_src_new_frontier_key_buffer.size()); + } else { + std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = + cugraph::test::device_gatherv( + *handle_, + std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), + std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); + std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = + cugraph::test::device_gatherv( + *handle_, + std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), + std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); + } + + auto mg_reduce_by_dst_aggregate_new_frontier_key_buffer = + cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); + if constexpr (std::is_same_v) { + mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_new_frontier_key_buffer).data(), - std::get<1>(mg_new_frontier_key_buffer).size()); + mg_reduce_by_dst_new_frontier_key_buffer.data(), + mg_reduce_by_dst_new_frontier_key_buffer.size()); + } else { + std::get<0>(mg_reduce_by_dst_aggregate_new_frontier_key_buffer) = + cugraph::test::device_gatherv( + *handle_, + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).data(), + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size()); + std::get<1>(mg_reduce_by_dst_aggregate_new_frontier_key_buffer) = + cugraph::test::device_gatherv( + *handle_, + std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).data(), + std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_aggregate_payload_buffer = + [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = 
cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { if constexpr (std::is_arithmetic_v) { - mg_aggregate_payload_buffer = cugraph::test::device_gatherv( - *handle_, mg_payload_buffer.data(), mg_payload_buffer.size()); + mg_reduce_by_src_aggregate_payload_buffer = + cugraph::test::device_gatherv(*handle_, + mg_reduce_by_src_payload_buffer.data(), + mg_reduce_by_src_payload_buffer.size()); } else { - std::get<0>(mg_aggregate_payload_buffer) = cugraph::test::device_gatherv( - *handle_, std::get<0>(mg_payload_buffer).data(), std::get<0>(mg_payload_buffer).size()); - std::get<1>(mg_aggregate_payload_buffer) = cugraph::test::device_gatherv( - *handle_, std::get<1>(mg_payload_buffer).data(), std::get<1>(mg_payload_buffer).size()); + std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = + cugraph::test::device_gatherv(*handle_, + std::get<0>(mg_reduce_by_src_payload_buffer).data(), + std::get<0>(mg_reduce_by_src_payload_buffer).size()); + std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = + cugraph::test::device_gatherv(*handle_, + std::get<1>(mg_reduce_by_src_payload_buffer).data(), + std::get<1>(mg_reduce_by_src_payload_buffer).size()); + } + } + + [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = + cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + mg_reduce_by_dst_aggregate_payload_buffer = + cugraph::test::device_gatherv(*handle_, + mg_reduce_by_dst_payload_buffer.data(), + mg_reduce_by_dst_payload_buffer.size()); + } else { + std::get<0>(mg_reduce_by_dst_aggregate_payload_buffer) = + cugraph::test::device_gatherv(*handle_, + std::get<0>(mg_reduce_by_dst_payload_buffer).data(), + std::get<0>(mg_reduce_by_dst_payload_buffer).size()); + std::get<1>(mg_reduce_by_dst_aggregate_payload_buffer) = + cugraph::test::device_gatherv(*handle_, + 
std::get<1>(mg_reduce_by_dst_payload_buffer).data(), + std::get<1>(mg_reduce_by_dst_payload_buffer).size()); } } @@ -304,15 +409,27 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort(handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_aggregate_new_frontier_key_buffer)); + thrust::sort( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); + + thrust::sort( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { thrust::sort_by_key( handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_aggregate_payload_buffer)); + cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); + + thrust::sort_by_key( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_payload_buffer)); } auto sg_graph_view = sg_graph.view(); @@ -354,22 +471,51 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), 
cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_new_frontier_key_buffer = + auto sg_reduce_by_src_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_payload_buffer = + [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { - sg_new_frontier_key_buffer = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); + sg_reduce_by_src_new_frontier_key_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_src( + *handle_, + sg_graph_view, + sg_vertex_frontier.bucket(bucket_idx_cur), + sg_src_prop.view(), + sg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + cugraph::reduce_op::null{}); } else { - std::tie(sg_new_frontier_key_buffer, sg_payload_buffer) = + std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = + cugraph::transform_reduce_v_frontier_outgoing_e_by_src( + *handle_, + sg_graph_view, + sg_vertex_frontier.bucket(bucket_idx_cur), + sg_src_prop.view(), + sg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + cugraph::reduce_op::plus{}); + } + + auto sg_reduce_by_dst_new_frontier_key_buffer = + cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); + [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = + cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); + if constexpr (std::is_same_v) { + sg_reduce_by_dst_new_frontier_key_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + *handle_, + sg_graph_view, + sg_vertex_frontier.bucket(bucket_idx_cur), + sg_src_prop.view(), + sg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + 
cugraph::reduce_op::null{}); + } else { + std::tie(sg_reduce_by_dst_new_frontier_key_buffer, sg_reduce_by_dst_payload_buffer) = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( *handle_, sg_graph_view, @@ -382,29 +528,56 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst } if constexpr (std::is_same_v) { - thrust::sort(handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_new_frontier_key_buffer)); + thrust::sort( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); + + thrust::sort( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - thrust::sort_by_key(handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_payload_buffer)); + thrust::sort_by_key( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); + + thrust::sort_by_key( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = - thrust::equal(handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_new_frontier_key_buffer), - 
cugraph::get_dataframe_buffer_begin(mg_aggregate_new_frontier_key_buffer)); + bool key_passed = thrust::equal( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); + ASSERT_TRUE(key_passed); + + key_passed = thrust::equal( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), + cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); ASSERT_TRUE(key_passed); if constexpr (!std::is_same_v) { - bool payload_passed = - thrust::equal(handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_aggregate_payload_buffer)); + bool payload_passed = thrust::equal( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); + ASSERT_TRUE(payload_passed); + + payload_passed = thrust::equal( + handle_->get_thrust_policy(), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), + cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), + cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_payload_buffer)); ASSERT_TRUE(payload_passed); } } @@ -417,20 +590,20 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst template std::unique_ptr - Tests_MGTransformReduceVFrontierOutgoingEByDst::handle_ = nullptr; + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst::handle_ = nullptr; -using 
Tests_MGTransformReduceVFrontierOutgoingEByDst_File = - Tests_MGTransformReduceVFrontierOutgoingEByDst; -using Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat = - Tests_MGTransformReduceVFrontierOutgoingEByDst; +using Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File = + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst; +using Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat = + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst; -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int32FloatVoidVoid) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int32FloatVoidVoid) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32FloatVoidVoid) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int32FloatVoidVoid) { auto param = GetParam(); run_current_test( @@ -438,13 +611,13 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int32FloatVoidInt32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int32FloatVoidInt32) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32FloatVoidInt32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int32FloatVoidInt32) { auto param = GetParam(); run_current_test( @@ -452,14 +625,16 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int32FloatVoidTupleFloatInt32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, 
+ CheckInt32Int32FloatVoidTupleFloatInt32) { auto param = GetParam(); run_current_test>( std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32FloatVoidTupleFloatInt32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, + CheckInt32Int32FloatVoidTupleFloatInt32) { auto param = GetParam(); run_current_test>( @@ -467,13 +642,13 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int32FloatInt32Void) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int32FloatInt32Void) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32FloatInt32Void) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int32FloatInt32Void) { auto param = GetParam(); run_current_test( @@ -481,14 +656,14 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int32FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int32FloatInt32Int32) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int32FloatInt32Int32) { auto param = GetParam(); run_current_test( @@ -496,7 +671,7 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int32Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } 
-TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int32FloatInt32TupleFloatInt32) { auto param = GetParam(); @@ -504,7 +679,7 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int32FloatInt32TupleFloatInt32) { auto param = GetParam(); @@ -513,14 +688,14 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt32Int64FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt32Int64FloatInt32Int32) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int64FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt32Int64FloatInt32Int32) { auto param = GetParam(); run_current_test( @@ -528,14 +703,14 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt32Int64Float cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_File, CheckInt64Int64FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, CheckInt64Int64FloatInt32Int32) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt64Int64FloatInt32Int32) +TEST_P(Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, CheckInt64Int64FloatInt32Int32) { auto param = GetParam(); run_current_test( @@ -545,7 +720,7 @@ TEST_P(Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, CheckInt64Int64Float 
INSTANTIATE_TEST_SUITE_P( file_test, - Tests_MGTransformReduceVFrontierOutgoingEByDst_File, + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_File, ::testing::Combine( ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), @@ -554,7 +729,7 @@ INSTANTIATE_TEST_SUITE_P( cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); INSTANTIATE_TEST_SUITE_P(rmat_small_test, - Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, ::testing::Combine(::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( @@ -566,7 +741,7 @@ INSTANTIATE_TEST_SUITE_P( vertex & edge type combination) by command line arguments and do not include more than one Rmat_Usecase that differ only in scale or edge factor (to avoid running same benchmarks more than once) */ - Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, + Tests_MGTransformReduceVFrontierOutgoingEBySrcDst_Rmat, ::testing::Combine( ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); diff --git a/cpp/tests/traversal/bfs_test.cpp b/cpp/tests/traversal/bfs_test.cpp index 8d3cdb3d24b..c5041378d78 100644 --- a/cpp/tests/traversal/bfs_test.cpp +++ b/cpp/tests/traversal/bfs_test.cpp @@ -101,7 +101,10 @@ class Tests_BFS : public ::testing::TestWithParam void run_current_test(BFS_Usecase const& bfs_usecase, input_usecase_t const& input_usecase) { - constexpr bool renumber = true; + bool constexpr renumber = true; + bool constexpr test_weighted = false; + bool constexpr drop_self_loops = false; + bool constexpr drop_multi_edges = false; using weight_t = float; @@ -117,7 +120,7 @@ class Tests_BFS : public ::testing::TestWithParam> d_renumber_map_labels{std::nullopt}; std::tie(graph, std::ignore, 
d_renumber_map_labels) = cugraph::test::construct_graph( - handle, input_usecase, false, renumber); + handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -132,6 +135,16 @@ class Tests_BFS : public ::testing::TestWithParam::edge_property(handle, graph_view, 2); graph_view.attach_edge_mask((*edge_mask).view()); } + { // FIXME: for testing, delete + auto num_self_loops = graph_view.count_self_loops(handle); + auto number_of_edges = graph_view.compute_number_of_edges(handle); + std::cout << "V=" << graph_view.number_of_vertices() << " E=" << number_of_edges + << " num_self_loops=" << num_self_loops; + if (graph_view.is_symmetric()) { + std::cout << " undirected E=" << ((number_of_edges - num_self_loops) / 2 + num_self_loops) + << std::endl; + } + } ASSERT_TRUE(static_cast(bfs_usecase.source) >= 0 && static_cast(bfs_usecase.source) < graph_view.number_of_vertices()) @@ -154,7 +167,7 @@ class Tests_BFS : public ::testing::TestWithParam::max()); if (cugraph::test::g_perf) { @@ -305,10 +318,12 @@ INSTANTIATE_TEST_SUITE_P( Tests_BFS_Rmat, ::testing::Values( // enable correctness checks - std::make_tuple(BFS_Usecase{0, false}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)), - std::make_tuple(BFS_Usecase{0, true}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + std::make_tuple( + BFS_Usecase{0, false}, + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)), + std::make_tuple( + BFS_Usecase{0, true}, + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)))); INSTANTIATE_TEST_SUITE_P( rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with @@ -319,9 +334,13 @@ INSTANTIATE_TEST_SUITE_P( Tests_BFS_Rmat, ::testing::Values( // disable correctness checks for large graphs - 
std::make_tuple(BFS_Usecase{0, false, false}, - cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)), - std::make_tuple(BFS_Usecase{0, true, false}, - cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + std::make_tuple( + BFS_Usecase{0, false, false}, + cugraph::test::Rmat_Usecase( + 20, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false /* scramble vertex IDs */)), + std::make_tuple( + BFS_Usecase{0, true, false}, + cugraph::test::Rmat_Usecase( + 20, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false /* scramble vertex IDs */)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 1b63ad3b085..aa59719f814 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -64,6 +64,11 @@ class Tests_MGBFS : public ::testing::TestWithParam> mg_renumber_map{std::nullopt}; std::tie(mg_graph, std::ignore, mg_renumber_map) = cugraph::test::construct_graph( - *handle_, input_usecase, false, true); + *handle_, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -95,6 +100,16 @@ class Tests_MGBFS : public ::testing::TestWithParam