From 897ecc876da7d18fb6a7954aa8d8adaaecd7e2fa Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Wed, 14 Sep 2022 10:29:46 -0700
Subject: [PATCH 01/18] rename a prim file

---
 ...utgoine_e.cuh => per_v_random_select_transform_outgoine_e.cuh} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cpp/src/prims/{per_v_random_select_and_transform_outgoine_e.cuh => per_v_random_select_transform_outgoine_e.cuh} (100%)
diff --git a/cpp/src/prims/per_v_random_select_and_transform_outgoine_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh
similarity index 100%
rename from cpp/src/prims/per_v_random_select_and_transform_outgoine_e.cuh
rename to cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh

From 0f3e9419dceb2ca012b900fab046badd9c2002a8 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Wed, 14 Sep 2022 10:30:40 -0700
Subject: [PATCH 02/18] initial prim implementation

---
 .../cugraph/utilities/device_functors.cuh     |  22 +
 ...r_v_random_select_transform_outgoine_e.cuh | 663 +++++++++++++++++-
 2 files changed, 684 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index 8354f7701f3..8a75f2a3379 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -75,6 +75,21 @@ struct check_in_range_t {
   __device__ bool operator()(T val) const { return (val >= min) && (val < max); }
 };
 
+template <typename T>
+struct strided_sum_t {
+  T const* values{nullptr};
+  size_t stride{0};
+  size_t count{0};
+
+  __device__ T operator()(size_t start_offset) const {
+    T sum{0};
+    for (size_t j = 0; j < count; ++j) {
+      sum += values[start_offset + stride * j];
+    }
+    return sum;
+  }
+};
+
 template <typename T>
 struct shift_left_t {
   T offset{};
@@ -104,6 +119,13 @@ struct multiply_and_add_t {
   __device__ T operator()(T input) const { return input * multiplier + adder; }
 };
 
+template <typename T>
+struct divider_t {
+  T divisor{};
+
+  __device__ T operator()(T input) const { return input / divisor; }
+};
+
 }  // namespace detail
 
 }  // namespace cugraph
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh
index bc213c713c4..8d22b906ff6 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh
@@ -17,6 +17,652 @@
 
 namespace cugraph {
 
+namespace detail {
+
+// convert a (neighbor index, key index) pair  to a (col_comm_rank, neighbor index, key index)
+// triplet, col_comm_rank is set to -1 if an neighbor index is invalid
+template <typename edge_t>
+struct convert_pair_to_triplet_t {
+  raft::device_span<edge_t const> gathered_local_degrees{};
+  size_t stride{};
+  size_t K{};
+  int32_t col_comm_size{};
+  edge_t invalid_idx{};
+
+  __device__ int32_t operator()(thrust::tuple<edge_t, size_t> index_pair) const
+  {
+    auto nbr_idx       = thrust::get<0>(index_pair);
+    auto key_idx       = thrust::get<1>(index_pair);
+    auto local_nbr_idx = nbr_idx;
+    int32_t col_comm_rank{-1};
+    if (nbr_idx != invalid_idx) {
+      col_comm_rank = col_comm_size - 1;
+      for (int rank = 0; rank < col_comm_size - 1; ++rank) {
+        auto local_degree = gathered_local_degrees[stride * rank + key_idx];
+        if (local_nbr_idx < local_degree) {
+          col_comm_rank = rank;
+          break;
+        } else {
+          local_nbr_idx -= local_degree;
+        }
+      }
+    }
+    return thrust::make_tuple(col_comm_rank, local_nbr_idx, key_idx);
+  }
+};
+
+template <typename edge_t>
+struct invalid_col_comm_rank_t {
+  int32_t invalid_col_comm_rank{};
+  __device__ bool operator()(thrust::tuple<edge_t, int32_t, size_t> triplet) const
+  {
+    return thrust::get<1>(triplet) == invalid_col_comm_rank;
+  }
+};
+
+template <typename GraphViewType,
+          typename KeyIterator,
+          typename OffsetIterator,
+          typename LocalNbrIdxIterator,
+          typename OutputValueIterator,
+          typename OutputCountIterator,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgeOp,
+          typename T>
+struct transform_and_count_local_nbr_indices_t {
+  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+
+  edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>
+    edge_partition{};
+  KeyIterator key_first{};
+  OffsetIterator offset_first{};
+  LocalNbrIdxIterator local_nbr_idx_first{};
+  OutputValueIterator output_value_first{};
+  thrust::optional<OutputCountIterator> output_count_first{};
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
+    EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgeOp e_op{};
+  edge_t invalid_idx{};
+  thrust::optional<T> invalid_value{thrust::nullopt};
+
+  __device__ void operator()(size_t key_idx) const
+  {
+    auto key = *(key_first + key_idx);
+    if constexpr (std::is_same_v<key_t, vertex_t>) {
+      major = key;
+    } else {
+      major = thrust::get<0>(key);
+    }
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    vertex_t const* indices{nullptr};
+    thrust::optional<weight_t const*> weights{thrust::nullopt};
+    [[maybe_unused]] edge_t local_degree{0};
+    if constexpr (GraphViewType::is_multi_gpu) {
+      auto major_hypersparse_first = edge_partition.major_hypersparse_first();
+      if (major_hypersparse_first && (major >= *major_hypersparse_first)) {
+        auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
+        if (major_hypersparse_idx) {
+          thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(
+            edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) +
+            *major_hypersparse_idx);
+        }
+      } else {
+        thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(major_offset);
+      }
+    } else {
+      thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(major_offset);
+    }
+    auto start_offset = *(local_nbr_idx_offset_first + key_idx);
+    auto end_offset   = *(local_nbr_idx_offset_first + (key_idx + 1));
+
+    size_t num_valid_local_nbr_indices{0};
+    for (size_t i = start_offset; i < end_offset; ++i) {
+      auto local_nbr_idx = *(local_nbr_idx_first + i);
+      if (local_nbr_idx != invalid_idx) {
+        assert(local_nbr_idx < local_degree);
+        auto minor        = indices[local_nbr_idx];
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+        std::conditional_t<GraphViewType::is_storage_transposed, vertex_t, key_t>
+          key_or_src{};  // key if major
+        std::conditional_t<GraphViewType::is_storage_transposed, key_t, vertex_t>
+          key_or_dst{};  // key if major
+        if constexpr (GraphViewType::is_storage_transposed) {
+          key_or_src = minor;
+          key_or_dst = key;
+        } else {
+          key_or_src = key;
+          key_or_dst = minor;
+        }
+        auto src_offset     = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+        auto dst_offset     = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+        *(output_first + i) = evaluate_edge_op<GraphViewType,
+                                               key_t,
+                                               EdgePartitionSrcValueInputWrapper,
+                                               EdgePartitionDstValueInputWrapper,
+                                               EdgeOp>()
+                                .compute(key_or_src,
+                                         key_or_dst,
+                                         weights ? (*weights)[local_nbr_idx] : weight_t{1.0},
+                                         edge_partition_src_value_input.get(src_offset),
+                                         edge_partition_dst_value_input.get(dst_offset),
+                                         e_op);
+        ++num_valid_local_nbr_indices;
+      } else if (invalid_value) {
+        *(output_first + i) = *invalid_value;
+      } else {
+        assert(output_count_first);
+      }
+    }
+    if (output_count_first) { *(*output_count_first + key_idx) = num_valid_local_nbr_indices; }
+  }
+};
+
+template <typename InputIterator, typename OutputIterator>
+struct copy_and_fill_sample_e_op_results_t {
+  raft::device_span<size_t const> sample_counts{};
+  raft::device_span<size_t const> sample_displacements{};
+  InputIterator input_first{};
+  OutputIterator output_first{};
+  size_t K{};
+
+  __device__ void operator()(size_t i) const
+  {
+    auto num_valid_samples = sample_counts[i];
+    for (size_t j = 0; j < num_valid_samples; ++j) {  // copy
+      *(output_first + K * i + j) = *(input_first + sample_displacements[i] + j);
+    }
+    for (size_t j = num_valid_samples; j < K; ++j) {  // fill
+      *(output_first + K * i + j) = invalid_value;
+    }
+  }
+};
+
+template <bool incoming,
+          typename GraphViewType,
+          typename VertexFrontierBucketType,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeOp,
+          typename T>
+std::tuple<std::optional<rmm::device_uvector<size_t>>,
+           decltype(allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}))>
+per_v_random_select_transform_e(raft::handle_t const& handle,
+                                GraphViewType const& graph_view,
+                                VertexFrontierBucketType const& frontier,
+                                EdgeSrcValueInputWrapper edge_src_value_input,
+                                EdgeDstValueInputWrapper edge_dst_value_input,
+#if 0  // FIXME: This will be necessary to include edge IDs in the output.
+       // Primitives API should be updated to support this in a consistent way.
+                               EdgeValueInputWrapper egde_value_input,
+#endif
+                                EdgeOp e_op,
+                                raft::random::RngState& rng_state,
+                                size_t K,
+                                bool with_replacement,
+                                std::optional<T> invalid_value,
+                                bool do_expensive_check)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+  using key_t    = typename VertexFrontierBucketType::key_type;
+
+  using edge_partition_src_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
+    edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeSrcValueInputWrapper::value_iterator>,
+                       edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeSrcValueInputWrapper::value_iterator>>>;
+  using edge_partition_dst_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
+    edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeDstValueInputWrapper::value_iterator>,
+                       edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeDstValueInputWrapper::value_iterator>>>;
+
+  static_assert(!GraphViewType::is_storage_transposed == incoming);
+  static_assert(std::is_same_v<typename evaluate_edge_op<GraphViewType,
+                                                         key_t,
+                                                         EdgeSrcValueInputWrapper,
+                                                         EdgeDstValueInputWrapper,
+                                                         EdgeOp>::result_type,
+                               T>);
+
+  if (do_expensive_check) {
+    // FIXME: better re-factor this check function?
+    vertex_t const* frontier_vertex_first{nullptr};
+    vertex_t const* frontier_vertex_last{nullptr};
+    if constexpr (std::is_same_v<key_t, vertex_t>) {
+      frontier_vertex_first = frontier.begin();
+      frontier_vertex_last  = frontier.end();
+    } else {
+      frontier_vertex_first = thrust::get<0>(frontier.begin().get_iterator_tuple());
+      frontier_vertex_last  = thrust::get<0>(frontier.end().get_iterator_tuple());
+    }
+    auto num_invalid_keys =
+      frontier.size() -
+      thrust::count_if(handle.get_thrust_policy(),
+                       frontier_vertex_first,
+                       frontier_vertex_last,
+                       check_in_range_t<vertex_t>{graph_view.local_vertex_partition_range_first(),
+                                                  graph_view.local_vertex_partition_range_last()});
+    if constexpr (GraphViewType::is_multi_gpu) {
+      num_invalid_keys = host_scalar_allreduce(
+        handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream());
+    }
+    CUGRAPH_EXPECTS(num_invalid_keys == size_t{0},
+                    "Invalid input argument: frontier includes out-of-range keys.");
+  }
+
+  auto frontier_key_first = frontier.begin();
+  auto frontier_key_last  = frontier.end();
+
+  std::vector<size_t> local_frontier_sizes{};
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& col_comm       = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    local_frontier_sizes = host_scalar_allgather(
+      col_comm,
+      static_cast<size_t>(thrust::distance(frontier_key_first, frontier_key_last)),
+      handle.get_stream());
+  } else {
+    local_frontier_sizes = std::vector<size_t>{static_cast<size_t>(
+      static_cast<vertex_t>(thrust::distance(frontier_key_first, frontier_key_last)))};
+  }
+  std::vector<size_t> local_frontier_displacements(local_frontier_sizes.size());
+  std::exclusive_scan(local_frontier_sizes.begin(),
+                      local_frontier_sizes.end(),
+                      local_frontier_displacements.begin(),
+                      size_t{0});
+
+  // 1. aggregate frontier
+
+  auto aggregate_local_frontier_keys =
+    GraphViewType::is_multi_gpu
+      ? std::make_optional<decltype(
+          allocate_dataframe_buffer<key_t>(size_t{0}, rmm::cuda_stream_view{}))>(
+          local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
+      : std::nullopt;
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    device_allgatherv(col_comm,
+                      frontier_key_first,
+                      get_dataframe_buffer_begin(aggregate_local_frontier_keys),
+                      local_frontier_sizes,
+                      local_frontier_displacements,
+                      handle.get_stream());
+  }
+
+  // 2. compute degrees
+
+  auto aggregate_local_frontier_local_degrees =
+    GrpahViewType::is_multi_gpu
+      ? std::make_optional<rmm::device_uvector<edge_t>>(
+          local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
+      : std::nullopt;
+  rmm::device_uvector<edge_t> frontier_degrees(frontier.size(), handle.get_stream());
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+
+    vertex_t const* edge_partition_frontier_major_first{nullptr};
+    vertex_t const* edge_partition_frontier_major_last{nullptr};
+
+    auto edge_partition_frontier_key_first =
+      (GraphViewType::is_multi_gpu ? get_dataframe_buffer_begin(*aggregate_local_frontier_keys)
+                                   : frontier_key_first) +
+      local_frontier_displacements[i];
+    if constexpr (std::is_same_v<key_t, vertex_t>) {
+      edge_partition_frontier_major_first = edge_partition_frontier_key_first;
+    } else {
+      edge_partition_frontier_major_first = thrust::get<0>(edge_partition_frontier_key_first);
+    }
+    edge_partition_frontier_major_last =
+      edge_partition_frontier_major_first + local_frontier_sizes[i];
+
+    auto edge_partition_frontier_local_degrees = edge_partition.compute_local_degrees(
+      raft::device_span<vertex_t const*>(edge_partition_frontier_major_first,
+                                         edge_partition_frontier_size),
+      handle.get_stream());
+
+    if constexpr (GraphViewType::is_multi_gpu) {
+      // FIXME: this copy is unnecessary if edge_partition.compute_local_degrees() takes a pointer
+      // to the output array
+      thrust::copy(
+        handle.get_thrust_policy(),
+        edge_partition_frontier_local_degrees.begin(),
+        edge_partition_frontier_local_degrees.end(),
+        (*aggregate_local_frontier_local_degrees).begin() + local_frontier_displacements[i]);
+    } else {
+      frontier_degrees = std::move(edge_partition_frontier_local_degrees);
+    }
+  }
+
+  auto frontier_gathered_local_degrees =
+    GrpahViewType::is_multi_gpu
+      ? std::make_optional<rmm::device_uvector<edge_t>>(frontier.size() * K, handle.get_stream())
+      : std::nullopt;
+  if constexpr (GrpahViewType::is_multi_gpu) {
+    std::tie(frontier_gathered_local_degrees, std::ignore) =
+      shuffle_values(col_comm,
+                     (*aggregate_local_frontier_local_degrees).begin(),
+                     local_frontier_sizes,
+                     handle.get_stream());
+    thrust::tablulate(handle.get_thrust_policy(),
+                      frontier_degrees.begin(),
+                      frontier_degrees.end(),
+                      strided_sum_t<edge_t>{frontier_gathered_local_degrees.data(),
+                                            frontier.size(),
+                                            static_cast<size_t>(col_comm_size)});
+    aggregate_local_frontier_local_degrees = std::nullopt;
+  }
+
+  // 3. randomly select neighbor indices
+
+  rmm::device_uvector<edge_t> sample_nbr_indices(frontier.size() * K, handle.get_stream());
+  // FIXME: get_sampling_index is inefficient when degree >> K & with_replacement = false
+  // FIXME: Need to verify that sample_nbr_indices[] are filled with INVALID_IDX when degree == 0
+  // (with_replacement = true) or degree < K (with_replacement = false)
+  cugraph_ops::get_sampling_index(sample_nbr_indices.data(),
+                                  rng_state,
+                                  frontier_degrees.data(),
+                                  static_cast<edge_t>(frontier.size()),
+                                  static_cats<int32_t>(K),
+                                  with_replacement,
+                                  handle.get_stream());
+  frontier_degrees.resize(0, handle.get_stream());
+  frontier_degrees.shrink_to_fit(handle.get_stream());
+
+  // 4. shuffle randomly selected indices
+
+  auto sample_local_nbr_indices = std::move(
+    sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
+                          // neighbors are distributed in col_comm_size partitions)
+  auto sample_key_indices =
+    GraphViewType::is_multi_gpu
+      ? std::make_optional<rmm::devie_uvector<size_t>>(0, handle.get_stream())
+      : std::nullopt;
+  auto local_frontier_sample_counts =
+    GrpahViewType::is_multi_gpu ? std::vector<size_t>(0) : std::vector<size_t>{frontier.size() * K};
+  auto local_frontier_sample_displacements =
+    GrpahViewType::is_multi_gpu ? std::vector<size_t>(0) : std::vector<size_t>{0};
+  if constexpr (GrpahViewType::is_multi_gpu) {
+    auto col_comm_ranks =
+      rmm::device_uvector<int32_t>(sample_local_nbr_indices.size(), handle.get_stream());
+    auto sample_key_indices =
+      rmm::device_uvector<size_t>(sample_local_nbr_indices.size(), handle.get_stream());
+    auto input_pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_local_nbr_indices.begin(),
+                         thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                                         divider_t<size_t>{K})));
+    thrust::transform(handle.get_thrsut_policy(),
+                      input_pair_first,
+                      input_pair_first + sample_indices.size(),
+                      thrust::make_zip_iterator(thrust::make_tuple(
+                        col_comm_ranks.begin(), sample_indices.begin(), key_indices.begin())),
+                      convert_pair_to_triplet_t<edge_t>{
+                        raft::device_span<edge_t const>(frontier_gathered_local_degrees.data(),
+                                                        frontier_gathered_local_degrees.size()),
+                        frontier.size(),
+                        K,
+                        col_comm_size,
+                        ops::gnn::graph::INVALID_IDX<edge_t>});
+
+    frontier_gathered_local_degrees = std::nullopt;
+
+    auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
+      sample_local_nbr_indices.bgin(), col_comm_ranks.begin(), key_indices.begin()));
+    sample_local_nbr_indices.resize(
+      thrust::distance(triplet_first,
+                       thrust::remove_if(handle.get_thrust_policy(),
+                                         triplet_first,
+                                         triplet_first + sample_local_nbr_indices.size(),
+                                         invalid_col_comm_rank<edge_t>{int32_t{-1}})),
+      handle.get_stream());
+    col_comm_ranks.resize(sample_local_nbr_indices.size(), handle.get_stream());
+    sample_key_indices.resize(sample_local_nbr_indices.size(), handle.get_stream());
+
+    auto d_tx_counts = groupby_and_count(col_comm_ranks.begin(),
+                                         col_comm_ranks.end(),
+                                         thrust::make_zip_iterator(thrust::make_tuple(
+                                           sample_local_nbr_indices.begin(), key_indices.begin())),
+                                         thrust::identity<int32_t>{},
+                                         col_comm_size,
+                                         std::numeric_limits<size_t>::max(),
+                                         handle.get_stream());
+
+    std::vector<size_t> h_tx_counts(d_tx_counts.size());
+    raft::update_host(
+      h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream());
+    handle.sync_stream();
+
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_local_nbr_indices.bgin(), key_indices.begin()));
+    auto [rx_value_buffer, rx_counts] =
+      shuffle_values(col_comm, pair_first, h_tx_counts, handle.get_stream());
+
+    sample_local_nbr_indices            = std::move(std::get<0>(rx_value_buffer));
+    sample_key_indices                  = std::move(std::get<1>(rx_value_buffer));
+    local_frontier_sample_displacements = std::vector<size_t>(rx_counts.size());
+    std::exclusive_scan(
+      rx_counts.begin(), rx_counts.end(), local_frontier_sample_displacements.begin());
+    local_frontier_sample_counts = std::move(rx_counts);
+  }
+
+  // 5. transform
+
+  auto sample_counts =
+    (!GraphViewType::is_multi_gpu && !invalid_value)
+      ? std::make_optional<rmm::device_uvector<size_t>>(frontier.size(), handle.get_stream())
+      : std::nullopt;
+  auto sample_e_op_results = allocate_dataframe_buffer<e_op_result_t>(
+    local_frontier_sample_displacements.back() + local_frontier_sample_counts.back(),
+    handle.get_stream());
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+
+    auto edge_partition_frontier_key_first =
+      (GraphViewType::is_multi_gpu ? get_dataframe_buffer_begin(*aggregate_local_frontier_keys)
+                                   : frontier_key_first) +
+      local_frontier_displacements[i];
+    auto edge_partition_sample_local_nbr_index_first =
+      sample_local_nbr_indices.begin() + local_frontier_sample_displacements[i];
+
+    auto edge_partition_sample_e_op_result_first =
+      get_dataframe_buffer_beign(sample_e_op_results) + local_frontier_sample_displacements[i];
+
+    edge_partition_src_input_device_view_t edge_partition_src_value_input{};
+    edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input);
+      edge_partition_dst_value_input =
+        edge_partition_dst_input_device_view_t(edge_dst_value_input, i);
+    } else {
+      edge_partition_src_value_input =
+        edge_partition_src_input_device_view_t(edge_src_value_input, i);
+      edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input);
+    }
+
+    if constexpr (GraphViewType::is_multi_gpu) {
+      thrust::sort_by_key(handle.get_thrust_policy(),
+                          (*sample_key_indices).begin() + local_frontier_sample_displacements[i],
+                          (*sample_key_indices).begin() + local_frontier_sample_displacements[i] +
+                            local_frontier_sample_counts[i],
+                          edge_partition_sample_local_nbr_index_first);
+      auto num_unique_key_indices =
+        thrust::count_if(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(size_t{0}),
+                         thrust::make_counting_iterator(local_frontier_sample_counts[i]),
+                         is_first_in_run_t<size_t const*>{sample_key_indices.data() +
+                                                          local_frontier_sample_displacements[i]});
+      rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_strema());
+      rmm::device_uvector<size_t> unique_key_local_nbr_idx_counts(num_unique_key_indices,
+                                                                  handle.get_stream());
+      thrust::reduce_by_key(handle.get_thrust_policy(),
+                            sample_key_indices.begin() + local_frontier_sample_displacements[i],
+                            sample_key_indices.begin() + local_frontier_sample_displacements[i] +
+                              local_frontier_sample_counts[i],
+                            thrust::make_constant_iterator(edge_t{1}),
+                            unique_key_indices.begin(),
+                            unique_key_local_nbr_idx_counts.begin());
+      rmm::device_uvector<size_t> unique_key_local_nbr_idx_offsets(num_unique_key_indices + 1,
+                                                                   handle.get_stream());
+      unique_key_local_nbr_idx_offsets.set_element_to_zero_async(size_t{0}, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             unique_local_nbr_idx_counts.begin(),
+                             unique_local_nbr_idx_counts.end(),
+                             unique_local_nbr_idx_offsets.begin() + 1);
+      auto offset_first = unique_key_local_nbr_idx_offsets.begin();
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        unique_key_indices.begin(),
+        unique_key_indices.end(),
+        transform_and_count_local_nbr_indices_t<GraphViewType,
+                                                decltype(edge_partition_frontier_key_first),
+                                                decltype(offset_first),
+                                                decltype(
+                                                  edge_partition_sample_local_nbr_index_first),
+                                                decltype(edge_partition_sample_e_op_result_first),
+                                                size_t const*,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                EdgeOp,
+                                                T>{edge_partition,
+                                                   edge_partition_frontier_key_first,
+                                                   offset_first,
+                                                   edge_partition_sample_local_nbr_index_first,
+                                                   edge_partition_sample_e_op_result_first,
+                                                   thrust::nullopt,
+                                                   edge_partition_src_value_input,
+                                                   edge_partition_dst_value_input,
+                                                   e_op,
+                                                   invalid_idx,
+                                                   invalid_value});
+    } else {
+      auto offset_first = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                                          multiplier_t<size_t>{K});
+      thrust::for_each(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator(size_t{0}),
+                       thrust::make_counting_iterator(unique_key_indices.size()),
+                       transform_and_count_local_nbr_indices_t<
+                         GraphViewType,
+                         decltype(edge_partition_frontier_key_first),
+                         decltype(offset_first),
+                         decltype(edge_partition_sample_local_nbr_index_first),
+                         decltype(edge_partition_sample_e_op_result_first),
+                         size_t const*,
+                         edge_partition_src_input_device_view_t,
+                         edge_partition_dst_input_device_view_t,
+                         EdgeOp,
+                         T>{edge_partition,
+                            edge_partition_frontier_key_first,
+                            offset_first,
+                            edge_partition_sample_local_nbr_index_first,
+                            edge_partition_sample_e_op_result_first,
+                            sample_counts ? thrust::optional<size_t const*>((*sample_counts).data())
+                                          : thrust::nullopt,
+                            edge_partition_src_value_input,
+                            edge_partition_dst_value_input,
+                            e_op,
+                            invalid_idx,
+                            invalid_value});
+    }
+  }
+
+  // 6. shuffle randomly selected & transformed results and update sample_offsets
+
+  auto sample_offsets = invalid_value ? std::nullopt
+                                      : std::make_optional<rmm::device_uvector<size_t>>(
+                                          frontier.size() + 1, handle.get_stream());
+  if (GraphViewType::is_multi_gpu) {
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(sample_e_op_results.bgin(), sample_key_indices.begin()));
+    auto [rx_value_buffer, rx_counts] =
+      shuffle_values(col_comm, pair_first, local_frontier_sample_counts, handle.get_stream());
+    sample_e_op_results = std::move(std::get<0>(rx_value_buffer));
+    sample_key_indices  = std::move(std::get<1>(rx_value_buffer));
+    // FIXME: better refactor this sort-and-reduce-by-key
+    thrust::sort_by_key(handle.get_thrust_policy(),
+                        sample_key_indices.begin(),
+                        sample_key_indices.end(),
+                        sample_e_op_results.begin());
+    auto num_unique_key_indices =
+    thrust::count_if(handle.get_thrust_policy(),
+                     thrust::make_counting_iterator(size_t{0}),
+                     thrust::make_counting_iterator(sample_key_indices.size()),
+                     is_first_in_run_t<size_t const*>{sample_key_indices.data()},
+    rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_strema());
+    rmm::device_uvector<size_t> unique_key_sample_counts(num_unique_key_indices,
+                                                                  handle.get_stream());
+    thrust::reduce_by_key(handle.get_thrust_policy(),
+                          sample_key_indices.begin(),
+                          sample_key_indices.end(),
+                          thrust::make_constant_iterator(edge_t{1}),
+                          unique_key_indices.begin(),
+                          unique_key_sample_counts.begin());
+    sample_counts = rmm::device_uvector<size_t>(frontier.size(), handle.get_stream());
+    thrust::fill(
+      handle.get_thrust_policy(), (*sample_counts).begin(), (*sample_counts).end(), size_t{0});
+    thrust::scatter(handle.get_thrust_policy(), unique_key_sample_counts.begin(), unique_key_sample_counts.end(), unique_key_indices.begin(), (*sample_counts).begin());
+    if (invalid_value) {
+      rmm::device_uvector<size_t> sample_displacements((*sample_counts).size(),
+                                                       handle.get_stream());
+      thrust::exclusive_scan(handle.get_thrust_policy(),
+                             (*sample_counts).beign(),
+                             (*sample_counts).emd(),
+                             sample_displacements.begin());
+      auto tmp_sample_e_op_results =
+        allocate_dataframe_buffer<T>(frontier.size() * K, handle.get_stream());
+      auto input_first  = get_dataframe_buffer_begin(sample_e_op_results);
+      auto output_first = get_dataframe_buffer_begin(tmp_sample_e_op_results);
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        unique_key_indices.begin(),
+        unique_key_indices.end(),
+        copy_and_fill_sample_e_op_results_t<decltype(input_first), decltype(output_first)>{
+          raft::device_span<size_t const>((*sample_counts).data(), (*sample_counts).size()),
+          raft::device_span<size_t const>(sample_displacements.data(), sample_displacements.size()),
+          input_first,
+          output_first,
+          K});
+      sample_e_op_results = std::move(tmp_sample_e_op_results);
+    }
+    else {
+      (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             (*sample_counts).begin(),
+                             (*sample_counts).end(),
+                             (*sample_offsets).begin() + 1);
+    }
+  } else {
+    if (!invalid_value) {
+      (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             (*sample_counts).begin(),
+                             (*sample_counts).end(),
+                             (*sample_offsets).begin() + 1);
+    }
+  }
+
+  return std::make_tuple(std::move(sample_offsets), std::move(sample_e_op_results));
+}
+
+}  // namespace detail
+
 /**
  * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases.
  *
@@ -86,12 +732,16 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
 #endif
                                          EdgeBiasOp e_bias_op,
                                          EdgeOp e_op,
+                                         raft::random::RngState& rng_state,
                                          size_t K,
                                          bool with_replacement,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
   static_assert(false, "unimplemented.");
+
+  return std::make_tuple(std::nullopt,
+                         allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
 }
 
 /**
@@ -159,12 +809,23 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                EdgeValueInputWrapper egde_value_input,
 #endif
                                          EdgeOp e_op,
+                                         raft::random::RngState& rng_state,
                                          size_t K,
                                          bool with_replacement,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
-  static_assert(false, "unimplemented.");
+  return detail::per_v_random_select_transform_e<false>(hanlde,
+                                                        graph_view,
+                                                        frontier,
+                                                        edge_src_value_input,
+                                                        edge_dst_value_input,
+                                                        e_op,
+                                                        rng_state,
+                                                        K,
+                                                        with_replacement,
+                                                        invalid_value,
+                                                        do_expensive_check);
 }
 
 }  // namespace cugraph

From 06598aecc4f0d150641451df9f3524430cf9262b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Wed, 14 Sep 2022 22:49:51 -0700
Subject: [PATCH 03/18] refactor prim tests

---
 cpp/tests/prims/mg_count_if_e.cu              | 169 +++-----------
 cpp/tests/prims/mg_extract_if_e.cu            | 105 ++-------
 ...extract_transform_v_frontier_outgoing_e.cu | 116 +++-------
 ..._v_transform_reduce_incoming_outgoing_e.cu | 214 +++++-------------
 cpp/tests/prims/mg_reduce_v.cu                | 157 +++----------
 cpp/tests/prims/mg_transform_reduce_e.cu      | 182 +++------------
 cpp/tests/prims/mg_transform_reduce_v.cu      |  92 +++-----
 ...orm_reduce_v_frontier_outgoing_e_by_dst.cu | 116 +++-------
 cpp/tests/prims/property_generator.cuh        | 143 ++++++++++++
 9 files changed, 409 insertions(+), 885 deletions(-)
 create mode 100644 cpp/tests/prims/property_generator.cuh

diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu
index 25bdb1e4190..5c96570c7d2 100644
--- a/cpp/tests/prims/mg_count_if_e.cu
+++ b/cpp/tests/prims/mg_count_if_e.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -50,119 +52,6 @@
 
 #include <random>
 
-template <typename... Args>
-struct property_type {
-  using type = std::conditional_t<(sizeof...(Args) > 1),
-                                  thrust::tuple<Args...>,
-                                  typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>;
-};
-
-template <typename vertex_t, typename... Args>
-struct property_transform
-  : public thrust::unary_function<vertex_t, typename property_type<Args...>::type> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__
-    typename std::enable_if_t<cugraph::is_thrust_tuple_of_arithmetic<type>::value, type>
-    operator()(const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return thrust::make_tuple(static_cast<Args>(value)...);
-  }
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__ typename std::enable_if_t<std::is_arithmetic<type>::value, type> operator()(
-    const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return static_cast<type>(value);
-  }
-};
-
-template <typename vertex_t, template <typename...> typename Tuple, typename... Args>
-struct property_transform<vertex_t, Tuple<Args...>> : public property_transform<vertex_t, Args...> {
-};
-
-template <typename... Args>
-struct generate_impl {
- private:
-  using type                 = typename property_type<Args...>::type;
-  using property_buffer_type = std::conditional_t<
-    (sizeof...(Args) > 1),
-    std::tuple<rmm::device_uvector<Args>...>,
-    rmm::device_uvector<typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>>;
-
- public:
-  static thrust::tuple<Args...> initial_value(int init)
-  {
-    return thrust::make_tuple(static_cast<Args>(init)...);
-  }
-  template <typename label_t>
-  static auto vertex_property(rmm::device_uvector<label_t>& labels,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto data = cugraph::allocate_dataframe_buffer<type>(labels.size(), handle.get_stream());
-    auto zip  = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      labels.begin(),
-                      labels.end(),
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-  template <typename label_t>
-  static auto vertex_property(thrust::counting_iterator<label_t> begin,
-                              thrust::counting_iterator<label_t> end,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto length = thrust::distance(begin, end);
-    auto data   = cugraph::allocate_dataframe_buffer<type>(length, handle.get_stream());
-    auto zip    = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      begin,
-                      end,
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-
-  template <typename graph_view_type>
-  static auto column_property(raft::handle_t const& handle,
-                              graph_view_type const& graph_view,
-                              property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_dst_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_dst_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-
-  template <typename graph_view_type>
-  static auto row_property(raft::handle_t const& handle,
-                           graph_view_type const& graph_view,
-                           property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_src_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_src_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-};
-
-template <typename T>
-struct generate : public generate_impl<T> {
-  static T initial_value(int init) { return static_cast<T>(init); }
-};
-template <typename... Args>
-struct generate<std::tuple<Args...>> : public generate_impl<Args...> {
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
   bool test_weighted{false};
@@ -216,11 +105,12 @@ class Tests_MGCountIfE
 
     const int hash_bin_count = 5;
 
-    auto vertex_property_data =
-      generate<result_t>::vertex_property((*d_mg_renumber_map_labels), hash_bin_count, *handle_);
-    auto col_prop =
-      generate<result_t>::column_property(*handle_, mg_graph_view, vertex_property_data);
-    auto row_prop = generate<result_t>::row_property(*handle_, mg_graph_view, vertex_property_data);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -231,11 +121,12 @@ class Tests_MGCountIfE
     auto result = count_if_e(
       *handle_,
       mg_graph_view,
-      row_prop.view(),
-      col_prop.view(),
-      [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-        return row_property < col_property;
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      [] __device__(auto row, auto col, weight_t wt, auto src_property, auto dst_property) {
+        return src_property < dst_property;
       });
+
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
       handle_->get_comms().barrier();
@@ -253,23 +144,23 @@ class Tests_MGCountIfE
           *handle_, input_usecase, prims_usecase.test_weighted, false);
       auto sg_graph_view = sg_graph.view();
 
-      auto sg_vertex_property_data = generate<result_t>::vertex_property(
+      auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+        *handle_,
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count,
-        *handle_);
-      auto sg_col_prop =
-        generate<result_t>::column_property(*handle_, sg_graph_view, sg_vertex_property_data);
-      auto sg_row_prop =
-        generate<result_t>::row_property(*handle_, sg_graph_view, sg_vertex_property_data);
+        hash_bin_count);
+      auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
+      auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
 
       auto expected_result = count_if_e(
         *handle_,
         sg_graph_view,
-        sg_row_prop.view(),
-        sg_col_prop.view(),
-        [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-          return row_property < col_property;
+        sg_src_prop.view(),
+        sg_dst_prop.view(),
+        [] __device__(auto row, auto col, weight_t wt, auto src_property, auto dst_property) {
+          return src_property < dst_property;
         });
       ASSERT_TRUE(expected_result == result);
     }
@@ -288,14 +179,14 @@ using Tests_MGCountIfE_Rmat = Tests_MGCountIfE<cugraph::test::Rmat_Usecase>;
 TEST_P(Tests_MGCountIfE_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(std::get<0>(param),
-                                                                           std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
 }
 
 TEST_P(Tests_MGCountIfE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -303,14 +194,14 @@ TEST_P(Tests_MGCountIfE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 TEST_P(Tests_MGCountIfE_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(std::get<0>(param),
-                                                                          std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
 }
 
 TEST_P(Tests_MGCountIfE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
diff --git a/cpp/tests/prims/mg_extract_if_e.cu b/cpp/tests/prims/mg_extract_if_e.cu
index 988905b9800..47ae30537bb 100644
--- a/cpp/tests/prims/mg_extract_if_e.cu
+++ b/cpp/tests/prims/mg_extract_if_e.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -51,39 +53,6 @@
 
 #include <random>
 
-template <typename TupleType, typename T, std::size_t... Is>
-__device__ auto make_type_casted_tuple_from_scalar(T val, std::index_sequence<Is...>)
-{
-  return thrust::make_tuple(
-    static_cast<typename thrust::tuple_element<Is, TupleType>::type>(val)...);
-}
-
-template <typename property_t, typename T>
-__device__ __host__ auto make_property_value(T val)
-{
-  property_t ret{};
-  if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
-    ret = make_type_casted_tuple_from_scalar<property_t>(
-      val, std::make_index_sequence<thrust::tuple_size<property_t>::value>{});
-  } else {
-    ret = static_cast<property_t>(val);
-  }
-  return ret;
-}
-
-template <typename vertex_t, typename property_t>
-struct property_transform_t {
-  int mod{};
-
-  constexpr __device__ property_t operator()(vertex_t const v) const
-  {
-    static_assert(cugraph::is_thrust_tuple_of_arithmetic<property_t>::value ||
-                  std::is_arithmetic_v<property_t>);
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    return make_property_value<property_t>(hash_func(v) % mod);
-  }
-};
-
 template <typename T, std::size_t... Is>
 __device__ bool compare_equal_scalar(T const& lhs, T const& rhs)
 {
@@ -146,7 +115,7 @@ class Tests_MGExtractIfE
   template <typename vertex_t,
             typename edge_t,
             typename weight_t,
-            typename property_t,
+            typename result_t,
             bool store_transposed>
   void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
   {
@@ -160,7 +129,7 @@ class Tests_MGExtractIfE
       hr_clock.start();
     }
 
-    auto [mg_graph, mg_renumber_map_labels] =
+    auto [mg_graph, d_mg_renumber_map_labels] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, true>(
         *handle_, input_usecase, true, true);
 
@@ -178,28 +147,12 @@ class Tests_MGExtractIfE
 
     constexpr int hash_bin_count = 5;
 
-    auto mg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-    thrust::transform(handle_->get_thrust_policy(),
-                      (*mg_renumber_map_labels).begin(),
-                      (*mg_renumber_map_labels).end(),
-                      cugraph::get_dataframe_buffer_begin(mg_property_buffer),
-                      property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-    cugraph::edge_src_property_t<decltype(mg_graph_view), property_t> mg_src_properties(
-      *handle_, mg_graph_view);
-    cugraph::edge_dst_property_t<decltype(mg_graph_view), property_t> mg_dst_properties(
-      *handle_, mg_graph_view);
-
-    update_edge_src_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_src_properties);
-    update_edge_dst_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_dst_properties);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -210,8 +163,8 @@ class Tests_MGExtractIfE
     auto [mg_edgelist_srcs, mg_edgelist_dsts, mg_edgelist_weights] =
       extract_if_e(*handle_,
                    mg_graph_view,
-                   mg_src_properties.view(),
-                   mg_dst_properties.view(),
+                   mg_src_prop.view(),
+                   mg_dst_prop.view(),
                    [] __device__(vertex_t src, vertex_t dst, auto src_val, auto dst_val) {
                      return src_val < dst_val;
                    });
@@ -230,7 +183,7 @@ class Tests_MGExtractIfE
       // 3-1. aggregate MG results
 
       auto mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv(
-        *handle_, (*mg_renumber_map_labels).data(), (*mg_renumber_map_labels).size());
+        *handle_, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
       auto mg_aggregate_edgelist_srcs =
         cugraph::test::device_gatherv(*handle_, mg_edgelist_srcs.data(), mg_edgelist_srcs.size());
       auto mg_aggregate_edgelist_dsts =
@@ -267,35 +220,21 @@ class Tests_MGExtractIfE
 
         // 3-4. run SG extract_if_e
 
-        auto sg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-          sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-        thrust::transform(
-          handle_->get_thrust_policy(),
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+          *handle_,
           thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
           thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-          cugraph::get_dataframe_buffer_begin(sg_property_buffer),
-          property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-        cugraph::edge_src_property_t<decltype(sg_graph_view), property_t> sg_src_properties(
-          *handle_, sg_graph_view);
-        cugraph::edge_dst_property_t<decltype(sg_graph_view), property_t> sg_dst_properties(
-          *handle_, sg_graph_view);
-
-        update_edge_src_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_src_properties);
-        update_edge_dst_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_dst_properties);
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
 
         auto [sg_edgelist_srcs, sg_edgelist_dsts, sg_edgelist_weights] =
           extract_if_e(*handle_,
                        sg_graph_view,
-                       sg_src_properties.view(),
-                       sg_dst_properties.view(),
+                       sg_src_prop.view(),
+                       sg_dst_prop.view(),
                        [] __device__(vertex_t src, vertex_t dst, auto src_val, auto dst_val) {
                          return src_val < dst_val;
                        });
diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
index 049a9583d0c..2be87bf2508 100644
--- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -57,39 +59,6 @@
 #include <random>
 #include <type_traits>
 
-template <typename TupleType, typename T, std::size_t... Is>
-__device__ __host__ auto make_type_casted_tuple_from_scalar(T val, std::index_sequence<Is...>)
-{
-  return thrust::make_tuple(
-    static_cast<typename thrust::tuple_element<Is, TupleType>::type>(val)...);
-}
-
-template <typename property_t, typename T>
-__device__ __host__ auto make_property_value(T val)
-{
-  property_t ret{};
-  if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
-    ret = make_type_casted_tuple_from_scalar<property_t>(
-      val, std::make_index_sequence<thrust::tuple_size<property_t>::value>{});
-  } else {
-    ret = static_cast<property_t>(val);
-  }
-  return ret;
-}
-
-template <typename vertex_t, typename property_t>
-struct property_transform_t {
-  int mod{};
-
-  constexpr __device__ property_t operator()(vertex_t const v) const
-  {
-    static_assert(cugraph::is_thrust_tuple_of_arithmetic<property_t>::value ||
-                  std::is_arithmetic_v<property_t>);
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    return make_property_value<property_t>(hash_func(v) % mod);
-  }
-};
-
 template <typename key_t, typename vertex_t, typename property_t, typename output_payload_t>
 struct e_op_t {
   static_assert(std::is_same_v<key_t, vertex_t> ||
@@ -174,7 +143,7 @@ class Tests_MGExtractTransformVFrontierOutgoingE
             typename output_payload_t>
   void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
   {
-    using property_t = int32_t;
+    using result_t = int32_t;
 
     using key_t =
       std::conditional_t<std::is_same_v<tag_t, void>, vertex_t, thrust::tuple<vertex_t, tag_t>>;
@@ -200,7 +169,7 @@ class Tests_MGExtractTransformVFrontierOutgoingE
       hr_clock.start();
     }
 
-    auto [mg_graph, mg_renumber_map_labels] =
+    auto [mg_graph, d_mg_renumber_map_labels] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, is_multi_gpu>(
         *handle_, input_usecase, false, renumber);
 
@@ -218,28 +187,12 @@ class Tests_MGExtractTransformVFrontierOutgoingE
 
     const int hash_bin_count = 5;
 
-    auto mg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-    thrust::transform(handle_->get_thrust_policy(),
-                      (*mg_renumber_map_labels).begin(),
-                      (*mg_renumber_map_labels).end(),
-                      cugraph::get_dataframe_buffer_begin(mg_property_buffer),
-                      property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-    cugraph::edge_src_property_t<decltype(mg_graph_view), property_t> mg_src_properties(
-      *handle_, mg_graph_view);
-    cugraph::edge_dst_property_t<decltype(mg_graph_view), property_t> mg_dst_properties(
-      *handle_, mg_graph_view);
-
-    update_edge_src_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_src_properties);
-    update_edge_dst_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_dst_properties);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
 
     auto mg_key_buffer = cugraph::allocate_dataframe_buffer<key_t>(
       mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -252,7 +205,7 @@ class Tests_MGExtractTransformVFrontierOutgoingE
       thrust::tabulate(handle_->get_thrust_policy(),
                        cugraph::get_dataframe_buffer_begin(mg_key_buffer),
                        cugraph::get_dataframe_buffer_end(mg_key_buffer),
-                       [mg_renumber_map_labels = (*mg_renumber_map_labels).data(),
+                       [mg_renumber_map_labels = (*d_mg_renumber_map_labels).data(),
                         local_vertex_partition_range_first =
                           mg_graph_view.local_vertex_partition_range_first()] __device__(size_t i) {
                          return thrust::make_tuple(
@@ -280,9 +233,9 @@ class Tests_MGExtractTransformVFrontierOutgoingE
       *handle_,
       mg_graph_view,
       mg_vertex_frontier.bucket(bucket_idx_cur),
-      mg_src_properties.view(),
-      mg_dst_properties.view(),
-      e_op_t<key_t, vertex_t, property_t, output_payload_t>{});
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      e_op_t<key_t, vertex_t, result_t, output_payload_t>{});
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -297,10 +250,10 @@ class Tests_MGExtractTransformVFrontierOutgoingE
 
     if (prims_usecase.check_correctness) {
       auto mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv(
-        *handle_, (*mg_renumber_map_labels).data(), (*mg_renumber_map_labels).size());
+        *handle_, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
 
       auto mg_aggregate_extract_transform_output_buffer = cugraph::allocate_dataframe_buffer<
-        typename e_op_t<key_t, vertex_t, property_t, output_payload_t>::return_type::value_type>(
+        typename e_op_t<key_t, vertex_t, result_t, output_payload_t>::return_type::value_type>(
         size_t{0}, handle_->get_stream());
       std::get<0>(mg_aggregate_extract_transform_output_buffer) =
         cugraph::test::device_gatherv(*handle_,
@@ -340,28 +293,15 @@ class Tests_MGExtractTransformVFrontierOutgoingE
             *handle_, input_usecase, false, false);
         auto sg_graph_view = sg_graph.view();
 
-        auto sg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-          sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-        thrust::transform(
-          handle_->get_thrust_policy(),
-          thrust::make_counting_iterator(vertex_t{0}),
-          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_size()),
-          cugraph::get_dataframe_buffer_begin(sg_property_buffer),
-          property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-        cugraph::edge_src_property_t<decltype(sg_graph_view), property_t> sg_src_properties(
-          *handle_, sg_graph_view);
-        cugraph::edge_dst_property_t<decltype(sg_graph_view), property_t> sg_dst_properties(
-          *handle_, sg_graph_view);
-        update_edge_src_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_src_properties);
-        update_edge_dst_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_dst_properties);
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
 
         auto sg_key_buffer = cugraph::allocate_dataframe_buffer<key_t>(
           sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -391,9 +331,9 @@ class Tests_MGExtractTransformVFrontierOutgoingE
           *handle_,
           sg_graph_view,
           sg_vertex_frontier.bucket(bucket_idx_cur),
-          sg_src_properties.view(),
-          sg_dst_properties.view(),
-          e_op_t<key_t, vertex_t, property_t, output_payload_t>{});
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          e_op_t<key_t, vertex_t, result_t, output_payload_t>{});
 
         thrust::sort(handle_->get_thrust_policy(),
                      cugraph::get_dataframe_buffer_begin(sg_extract_transform_output_buffer),
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index 9881cbade01..af5703b32ea 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -52,111 +54,6 @@
 
 #include <random>
 
-template <typename... Args>
-struct property_type {
-  using type = std::conditional_t<(sizeof...(Args) > 1),
-                                  thrust::tuple<Args...>,
-                                  typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>;
-};
-
-template <typename vertex_t, typename... Args>
-struct property_transform
-  : public thrust::unary_function<vertex_t, typename property_type<Args...>::type> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__
-    typename std::enable_if_t<cugraph::is_thrust_tuple_of_arithmetic<type>::value, type>
-    operator()(const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return thrust::make_tuple(static_cast<Args>(value)...);
-  }
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__ typename std::enable_if_t<std::is_arithmetic<type>::value, type> operator()(
-    const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return static_cast<type>(value);
-  }
-};
-
-template <typename vertex_t, template <typename...> typename Tuple, typename... Args>
-struct property_transform<vertex_t, Tuple<Args...>> : public property_transform<vertex_t, Args...> {
-};
-
-template <typename... Args>
-struct generate_impl {
- private:
-  using property_buffer_type = std::conditional_t<
-    (sizeof...(Args) > 1),
-    std::tuple<rmm::device_uvector<Args>...>,
-    rmm::device_uvector<typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>>;
-
- public:
-  using type = typename property_type<Args...>::type;
-  static thrust::tuple<Args...> initial_value(int init)
-  {
-    return thrust::make_tuple(static_cast<Args>(init)...);
-  }
-  template <typename label_t>
-  static auto vertex_property(rmm::device_uvector<label_t>& labels,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto data = cugraph::allocate_dataframe_buffer<type>(labels.size(), handle.get_stream());
-    auto zip  = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      labels.begin(),
-                      labels.end(),
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-  template <typename label_t>
-  static auto vertex_property(thrust::counting_iterator<label_t> begin,
-                              thrust::counting_iterator<label_t> end,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto length = thrust::distance(begin, end);
-    auto data   = cugraph::allocate_dataframe_buffer<type>(length, handle.get_stream());
-    auto zip    = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      begin,
-                      end,
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-
-  template <typename graph_view_type>
-  static auto column_property(raft::handle_t const& handle,
-                              graph_view_type const& graph_view,
-                              property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_dst_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_dst_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-
-  template <typename graph_view_type>
-  static auto row_property(raft::handle_t const& handle,
-                           graph_view_type const& graph_view,
-                           property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_src_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_src_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-};
-
 template <typename T>
 struct comparator {
   static constexpr double threshold_ratio{1e-2};
@@ -211,14 +108,6 @@ buffer_type aggregate(const raft::handle_t& handle, const buffer_type& result)
   return aggregated_result;
 }
 
-template <typename T>
-struct generate : public generate_impl<T> {
-  static T initial_value(int init) { return static_cast<T>(init); }
-};
-template <typename... Args>
-struct generate<std::tuple<Args...>> : public generate_impl<Args...> {
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
   bool test_weighted{false};
@@ -273,16 +162,19 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
     const int hash_bin_count = 5;
     const int initial_value  = 4;
 
-    auto property_initial_value = generate<result_t>::initial_value(initial_value);
-    using property_t            = decltype(property_initial_value);
-    auto vertex_property_data =
-      generate<result_t>::vertex_property((*d_mg_renumber_map_labels), hash_bin_count, *handle_);
-    auto col_prop =
-      generate<result_t>::column_property(*handle_, mg_graph_view, vertex_property_data);
-    auto row_prop = generate<result_t>::row_property(*handle_, mg_graph_view, vertex_property_data);
-    auto out_result = cugraph::allocate_dataframe_buffer<property_t>(
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto out_result = cugraph::allocate_dataframe_buffer<result_t>(
       mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-    auto in_result = cugraph::allocate_dataframe_buffer<property_t>(
+    auto in_result = cugraph::allocate_dataframe_buffer<result_t>(
       mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
 
     if (cugraph::test::g_perf) {
@@ -294,13 +186,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
     per_v_transform_reduce_incoming_e(
       *handle_,
       mg_graph_view,
-      row_prop.view(),
-      col_prop.view(),
-      [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-        if (row_property < col_property) {
-          return row_property;
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+        if (src_property < dst_property) {
+          return src_property;
         } else {
-          return col_property;
+          return dst_property;
         }
       },
       property_initial_value,
@@ -323,13 +215,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
     per_v_transform_reduce_outgoing_e(
       *handle_,
       mg_graph_view,
-      row_prop.view(),
-      col_prop.view(),
-      [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-        if (row_property < col_property) {
-          return row_property;
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+        if (src_property < dst_property) {
+          return src_property;
         } else {
-          return col_property;
+          return dst_property;
         }
       },
       property_initial_value,
@@ -352,46 +244,46 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
           *handle_, input_usecase, true, false);
       auto sg_graph_view = sg_graph.view();
 
-      auto sg_vertex_property_data = generate<result_t>::vertex_property(
+      auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+        *handle_,
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count,
-        *handle_);
-      auto sg_col_prop =
-        generate<result_t>::column_property(*handle_, sg_graph_view, sg_vertex_property_data);
-      auto sg_row_prop =
-        generate<result_t>::row_property(*handle_, sg_graph_view, sg_vertex_property_data);
+        hash_bin_count);
+      auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
+      auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
       result_compare comp{*handle_};
 
-      auto global_out_result = cugraph::allocate_dataframe_buffer<property_t>(
+      auto global_out_result = cugraph::allocate_dataframe_buffer<result_t>(
         sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
       per_v_transform_reduce_outgoing_e(
         *handle_,
         sg_graph_view,
-        sg_row_prop.view(),
-        sg_col_prop.view(),
-        [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-          if (row_property < col_property) {
-            return row_property;
+        sg_src_prop.view(),
+        sg_dst_prop.view(),
+        [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+          if (src_property < dst_property) {
+            return src_property;
           } else {
-            return col_property;
+            return dst_property;
           }
         },
         property_initial_value,
         cugraph::get_dataframe_buffer_begin(global_out_result));
 
-      auto global_in_result = cugraph::allocate_dataframe_buffer<property_t>(
+      auto global_in_result = cugraph::allocate_dataframe_buffer<result_t>(
         sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
       per_v_transform_reduce_incoming_e(
         *handle_,
         sg_graph_view,
-        sg_row_prop.view(),
-        sg_col_prop.view(),
-        [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-          if (row_property < col_property) {
-            return row_property;
+        sg_src_prop.view(),
+        sg_dst_prop.view(),
+        [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+          if (src_property < dst_property) {
+            return src_property;
           } else {
-            return col_property;
+            return dst_property;
           }
         },
         property_initial_value,
@@ -427,15 +319,15 @@ TEST_P(Tests_MGPerVTransformReduceIncomingOutgoingE_File,
        CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(std::get<0>(param),
-                                                                           std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
 }
 
 TEST_P(Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat,
        CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -444,15 +336,15 @@ TEST_P(Tests_MGPerVTransformReduceIncomingOutgoingE_File,
        CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(std::get<0>(param),
-                                                                          std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
 }
 
 TEST_P(Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat,
        CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu
index e488bf2bf4f..043dd4658da 100644
--- a/cpp/tests/prims/mg_reduce_v.cu
+++ b/cpp/tests/prims/mg_reduce_v.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -48,87 +50,6 @@
 
 #include <random>
 
-template <typename vertex_t, typename... Args>
-struct property_transform : public thrust::unary_function<vertex_t, thrust::tuple<Args...>> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-  constexpr __device__ auto operator()(const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return thrust::make_tuple(static_cast<Args>(value)...);
-  }
-};
-
-template <typename vertex_t, template <typename...> typename Tuple, typename... Args>
-struct property_transform<vertex_t, Tuple<Args...>> : public property_transform<vertex_t, Args...> {
-};
-
-template <typename Tuple, std::size_t... I>
-auto make_iterator_tuple(Tuple& data, std::index_sequence<I...>)
-{
-  return thrust::make_tuple((std::get<I>(data).begin())...);
-}
-
-template <typename... Args>
-auto get_zip_iterator(std::tuple<Args...>& data)
-{
-  return thrust::make_zip_iterator(make_iterator_tuple(
-    data, std::make_index_sequence<std::tuple_size<std::tuple<Args...>>::value>()));
-}
-
-template <typename T>
-auto get_property_iterator(std::tuple<T>& data)
-{
-  return (std::get<0>(data)).begin();
-}
-
-template <typename T0, typename... Args>
-auto get_property_iterator(std::tuple<T0, Args...>& data)
-{
-  return get_zip_iterator(data);
-}
-
-template <typename... Args>
-struct generate_impl {
-  static thrust::tuple<Args...> initial_value(int init)
-  {
-    return thrust::make_tuple(static_cast<Args>(init)...);
-  }
-
-  template <typename label_t>
-  static std::tuple<rmm::device_uvector<Args>...> property(rmm::device_uvector<label_t>& labels,
-                                                           int hash_bin_count,
-                                                           raft::handle_t const& handle)
-  {
-    auto data = std::make_tuple(rmm::device_uvector<Args>(labels.size(), handle.get_stream())...);
-    auto zip  = get_zip_iterator(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      labels.begin(),
-                      labels.end(),
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-
-  template <typename label_t>
-  static std::tuple<rmm::device_uvector<Args>...> property(thrust::counting_iterator<label_t> begin,
-                                                           thrust::counting_iterator<label_t> end,
-                                                           int hash_bin_count,
-                                                           raft::handle_t const& handle)
-  {
-    auto length = thrust::distance(begin, end);
-    auto data   = std::make_tuple(rmm::device_uvector<Args>(length, handle.get_stream())...);
-    auto zip    = get_zip_iterator(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      begin,
-                      end,
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-};
-
 template <typename T>
 struct result_compare {
   static constexpr double threshold_ratio{1e-2};
@@ -173,15 +94,6 @@ struct result_compare<thrust::tuple<Args...>> {
   }
 };
 
-template <typename T>
-struct generate : public generate_impl<T> {
-  static T initial_value(int init) { return static_cast<T>(init); }
-};
-
-template <typename... Args>
-struct generate<std::tuple<Args...>> : public generate_impl<Args...> {
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -236,17 +148,18 @@ class Tests_MGReduceV
     const int hash_bin_count = 5;
     const int initial_value  = 10;
 
-    auto property_initial_value = generate<result_t>::initial_value(initial_value);
-    using property_t            = decltype(property_initial_value);
-    auto property_data =
-      generate<result_t>::property((*d_mg_renumber_map_labels), hash_bin_count, *handle_);
-    auto property_iter = get_property_iterator(property_data);
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, (*d_mg_renumber_map_labels), hash_bin_count);
+    auto property_iter = cugraph::get_dataframe_buffer_begin(mg_vertex_prop);
 
     enum class reduction_type_t { PLUS, MINIMUM, MAXIMUM };
     reduction_type_t reduction_types[] = {
       reduction_type_t::PLUS, reduction_type_t::MINIMUM, reduction_type_t::MAXIMUM};
 
-    std::unordered_map<reduction_type_t, property_t> results;
+    std::unordered_map<reduction_type_t, result_t> results;
 
     for (auto reduction_type : reduction_types) {
       if (cugraph::test::g_perf) {
@@ -261,21 +174,21 @@ class Tests_MGReduceV
                                              mg_graph_view,
                                              property_iter,
                                              property_initial_value,
-                                             cugraph::reduce_op::plus<property_t>{});
+                                             cugraph::reduce_op::plus<result_t>{});
           break;
         case reduction_type_t::MINIMUM:
           results[reduction_type] = reduce_v(*handle_,
                                              mg_graph_view,
                                              property_iter,
                                              property_initial_value,
-                                             cugraph::reduce_op::minimum<property_t>{});
+                                             cugraph::reduce_op::minimum<result_t>{});
           break;
         case reduction_type_t::MAXIMUM:
           results[reduction_type] = reduce_v(*handle_,
                                              mg_graph_view,
                                              property_iter,
                                              property_initial_value,
-                                             cugraph::reduce_op::maximum<property_t>{});
+                                             cugraph::reduce_op::maximum<result_t>{});
           break;
         default: FAIL() << "should not be reached.";
       }
@@ -298,40 +211,40 @@ class Tests_MGReduceV
           *handle_, input_usecase, true, false);
       auto sg_graph_view = sg_graph.view();
 
-      auto sg_property_data = generate<result_t>::property(
+      auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+        *handle_,
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count,
-        *handle_);
-      auto sg_property_iter = get_property_iterator(sg_property_data);
+        hash_bin_count);
+      auto sg_property_iter = cugraph::get_dataframe_buffer_begin(sg_vertex_prop);
 
       for (auto reduction_type : reduction_types) {
-        property_t expected_result{};
+        result_t expected_result{};
         switch (reduction_type) {
           case reduction_type_t::PLUS:
             expected_result = reduce_v(*handle_,
                                        sg_graph_view,
                                        sg_property_iter,
                                        property_initial_value,
-                                       cugraph::reduce_op::plus<property_t>{});
+                                       cugraph::reduce_op::plus<result_t>{});
             break;
           case reduction_type_t::MINIMUM:
             expected_result = reduce_v(*handle_,
                                        sg_graph_view,
                                        sg_property_iter,
                                        property_initial_value,
-                                       cugraph::reduce_op::minimum<property_t>{});
+                                       cugraph::reduce_op::minimum<result_t>{});
             break;
           case reduction_type_t::MAXIMUM:
             expected_result = reduce_v(*handle_,
                                        sg_graph_view,
                                        sg_property_iter,
                                        property_initial_value,
-                                       cugraph::reduce_op::maximum<property_t>{});
+                                       cugraph::reduce_op::maximum<result_t>{});
             break;
           default: FAIL() << "should not be reached.";
         }
-        result_compare<property_t> compare{};
+        result_compare<result_t> compare{};
         ASSERT_TRUE(compare(expected_result, results[reduction_type]));
       }
     }
@@ -350,14 +263,14 @@ using Tests_MGReduceV_Rmat = Tests_MGReduceV<cugraph::test::Rmat_Usecase>;
 TEST_P(Tests_MGReduceV_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(std::get<0>(param),
-                                                                           std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int32_t, float>, false>(
+    std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int32_t, float>, false>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -365,14 +278,14 @@ TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 TEST_P(Tests_MGReduceV_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(std::get<0>(param),
-                                                                          std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int32_t, float>, true>(
+    std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int32_t, float>, true>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -380,54 +293,54 @@ TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 TEST_P(Tests_MGReduceV_File, CheckInt32Int32FloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int, false>(std::get<0>(param), std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, int32_t, false>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int, false>(
+  run_current_test<int32_t, int32_t, float, int32_t, false>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int64FloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int64_t, float, int, false>(
+  run_current_test<int32_t, int64_t, float, int32_t, false>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt64Int64FloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int64_t, int64_t, float, int, false>(
+  run_current_test<int64_t, int64_t, float, int32_t, false>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
 TEST_P(Tests_MGReduceV_File, CheckInt32Int32FloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int, true>(std::get<0>(param), std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, int32_t, true>(std::get<0>(param), std::get<1>(param));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int32FloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, int, true>(
+  run_current_test<int32_t, int32_t, float, int32_t, true>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt32Int64FloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int64_t, float, int, true>(
+  run_current_test<int32_t, int64_t, float, int32_t, true>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
 TEST_P(Tests_MGReduceV_Rmat, CheckInt64Int64FloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int64_t, int64_t, float, int, true>(
+  run_current_test<int64_t, int64_t, float, int32_t, true>(
     std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
 
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index 82356809952..6935149b6d2 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -50,111 +52,6 @@
 
 #include <random>
 
-template <typename... Args>
-struct property_type {
-  using type = std::conditional_t<(sizeof...(Args) > 1),
-                                  thrust::tuple<Args...>,
-                                  typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>;
-};
-
-template <typename vertex_t, typename... Args>
-struct property_transform
-  : public thrust::unary_function<vertex_t, typename property_type<Args...>::type> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__
-    typename std::enable_if_t<cugraph::is_thrust_tuple_of_arithmetic<type>::value, type>
-    operator()(const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return thrust::make_tuple(static_cast<Args>(value)...);
-  }
-
-  template <typename type = typename property_type<Args...>::type>
-  constexpr __device__ typename std::enable_if_t<std::is_arithmetic<type>::value, type> operator()(
-    const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return static_cast<type>(value);
-  }
-};
-
-template <typename vertex_t, template <typename...> typename Tuple, typename... Args>
-struct property_transform<vertex_t, Tuple<Args...>> : public property_transform<vertex_t, Args...> {
-};
-
-template <typename... Args>
-struct generate_impl {
- private:
-  using type                 = typename property_type<Args...>::type;
-  using property_buffer_type = std::conditional_t<
-    (sizeof...(Args) > 1),
-    std::tuple<rmm::device_uvector<Args>...>,
-    rmm::device_uvector<typename thrust::tuple_element<0, thrust::tuple<Args...>>::type>>;
-
- public:
-  static thrust::tuple<Args...> initial_value(int init)
-  {
-    return thrust::make_tuple(static_cast<Args>(init)...);
-  }
-  template <typename label_t>
-  static auto vertex_property(rmm::device_uvector<label_t>& labels,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto data = cugraph::allocate_dataframe_buffer<type>(labels.size(), handle.get_stream());
-    auto zip  = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      labels.begin(),
-                      labels.end(),
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-  template <typename label_t>
-  static auto vertex_property(thrust::counting_iterator<label_t> begin,
-                              thrust::counting_iterator<label_t> end,
-                              int hash_bin_count,
-                              raft::handle_t const& handle)
-  {
-    auto length = thrust::distance(begin, end);
-    auto data   = cugraph::allocate_dataframe_buffer<type>(length, handle.get_stream());
-    auto zip    = cugraph::get_dataframe_buffer_begin(data);
-    thrust::transform(handle.get_thrust_policy(),
-                      begin,
-                      end,
-                      zip,
-                      property_transform<label_t, Args...>(hash_bin_count));
-    return data;
-  }
-
-  template <typename graph_view_type>
-  static auto column_property(raft::handle_t const& handle,
-                              graph_view_type const& graph_view,
-                              property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_dst_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_dst_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-
-  template <typename graph_view_type>
-  static auto row_property(raft::handle_t const& handle,
-                           graph_view_type const& graph_view,
-                           property_buffer_type& property)
-  {
-    auto output_property = cugraph::edge_src_property_t<graph_view_type, type>(handle, graph_view);
-    update_edge_src_property(
-      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
-    return output_property;
-  }
-};
-
 template <typename T>
 struct result_compare {
   static constexpr double threshold_ratio{1e-3};
@@ -193,14 +90,6 @@ struct result_compare<thrust::tuple<Args...>> {
   }
 };
 
-template <typename T>
-struct generate : public generate_impl<T> {
-  static T initial_value(int init) { return static_cast<T>(init); }
-};
-template <typename... Args>
-struct generate<std::tuple<Args...>> : public generate_impl<Args...> {
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
   bool test_weighted{false};
@@ -255,13 +144,14 @@ class Tests_MGTransformReduceE
     const int hash_bin_count = 5;
     const int initial_value  = 4;
 
-    auto property_initial_value = generate<result_t>::initial_value(initial_value);
-    using property_t            = decltype(property_initial_value);
-    auto vertex_property_data =
-      generate<result_t>::vertex_property((*d_mg_renumber_map_labels), hash_bin_count, *handle_);
-    auto col_prop =
-      generate<result_t>::column_property(*handle_, mg_graph_view, vertex_property_data);
-    auto row_prop = generate<result_t>::row_property(*handle_, mg_graph_view, vertex_property_data);
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -272,13 +162,13 @@ class Tests_MGTransformReduceE
     auto result = transform_reduce_e(
       *handle_,
       mg_graph_view,
-      row_prop.view(),
-      col_prop.view(),
-      [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-        if (row_property < col_property) {
-          return row_property;
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+        if (src_property < dst_property) {
+          return src_property;
         } else {
-          return col_property;
+          return dst_property;
         }
       },
       property_initial_value);
@@ -300,30 +190,30 @@ class Tests_MGTransformReduceE
           *handle_, input_usecase, true, false);
       auto sg_graph_view = sg_graph.view();
 
-      auto sg_vertex_property_data = generate<result_t>::vertex_property(
+      auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+        *handle_,
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
         thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count,
-        *handle_);
-      auto sg_col_prop =
-        generate<result_t>::column_property(*handle_, sg_graph_view, sg_vertex_property_data);
-      auto sg_row_prop =
-        generate<result_t>::row_property(*handle_, sg_graph_view, sg_vertex_property_data);
+        hash_bin_count);
+      auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
+      auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+        *handle_, sg_graph_view, sg_vertex_prop);
 
       auto expected_result = transform_reduce_e(
         *handle_,
         sg_graph_view,
-        sg_row_prop.view(),
-        sg_col_prop.view(),
-        [] __device__(auto row, auto col, weight_t wt, auto row_property, auto col_property) {
-          if (row_property < col_property) {
-            return row_property;
+        sg_src_prop.view(),
+        sg_dst_prop.view(),
+        [] __device__(auto src, auto dst, weight_t wt, auto src_property, auto dst_property) {
+          if (src_property < dst_property) {
+            return src_property;
           } else {
-            return col_property;
+            return dst_property;
           }
         },
         property_initial_value);
-      result_compare<property_t> compare{};
+      result_compare<result_t> compare{};
       ASSERT_TRUE(compare(expected_result, result));
     }
   }
@@ -341,14 +231,14 @@ using Tests_MGTransformReduceE_Rmat = Tests_MGTransformReduceE<cugraph::test::Rm
 TEST_P(Tests_MGTransformReduceE_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(std::get<0>(param),
-                                                                           std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
 }
 
 TEST_P(Tests_MGTransformReduceE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -356,14 +246,14 @@ TEST_P(Tests_MGTransformReduceE_Rmat, CheckInt32Int32FloatTupleIntFloatTranspose
 TEST_P(Tests_MGTransformReduceE_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(std::get<0>(param),
-                                                                          std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
 }
 
 TEST_P(Tests_MGTransformReduceE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu
index d7b2f18cdd8..e4f2560f41b 100644
--- a/cpp/tests/prims/mg_transform_reduce_v.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -44,28 +46,14 @@
 
 #include <random>
 
-template <typename vertex_t, typename T>
-struct property_transform : public thrust::unary_function<vertex_t, T> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-  constexpr __device__ auto operator()(vertex_t, const vertex_t& val)
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return static_cast<T>(value);
-  }
-};
+template <typename vertex_t, typename property_t>
+struct v_op_t {
+  int32_t mod{};
 
-template <typename vertex_t, typename... Args>
-struct property_transform<vertex_t, std::tuple<Args...>>
-  : public thrust::unary_function<vertex_t, thrust::tuple<Args...>> {
-  int mod{};
-  property_transform(int mod_count) : mod(mod_count) {}
-  constexpr __device__ auto operator()(vertex_t, const vertex_t& val)
+  __device__ auto operator()(vertex_t, vertex_t val) const
   {
     cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto value = hash_func(val) % mod;
-    return thrust::make_tuple(static_cast<Args>(value)...);
+    return cugraph::test::detail::make_property_value<property_t>(hash_func(val) % mod);
   }
 };
 
@@ -113,18 +101,6 @@ struct result_compare<thrust::tuple<Args...>> {
   }
 };
 
-template <typename T>
-struct generate {
-  static T initial_value(int init) { return static_cast<T>(init); }
-};
-template <typename... Args>
-struct generate<std::tuple<Args...>> {
-  static thrust::tuple<Args...> initial_value(int init)
-  {
-    return thrust::make_tuple(static_cast<Args>(init)...);
-  }
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -178,14 +154,14 @@ class Tests_MGTransformReduceV
     const int hash_bin_count = 5;
     const int initial_value  = 10;
 
-    property_transform<vertex_t, result_t> prop(hash_bin_count);
-    auto property_initial_value = generate<result_t>::initial_value(initial_value);
-    using property_t            = decltype(property_initial_value);
+    v_op_t<vertex_t, result_t> v_op{hash_bin_count};
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
     enum class reduction_type_t { PLUS, MINIMUM, MAXIMUM };
     reduction_type_t reduction_types[] = {
       reduction_type_t::PLUS, reduction_type_t::MINIMUM, reduction_type_t::MAXIMUM};
 
-    std::unordered_map<reduction_type_t, property_t> results;
+    std::unordered_map<reduction_type_t, result_t> results;
 
     for (auto reduction_type : reduction_types) {
       if (cugraph::test::g_perf) {
@@ -198,26 +174,26 @@ class Tests_MGTransformReduceV
         case reduction_type_t::PLUS:
           results[reduction_type] = transform_reduce_v(*handle_,
                                                        mg_graph_view,
-                                                       d_mg_renumber_map_labels->begin(),
-                                                       prop,
+                                                       (*d_mg_renumber_map_labels).begin(),
+                                                       v_op,
                                                        property_initial_value,
-                                                       cugraph::reduce_op::plus<property_t>{});
+                                                       cugraph::reduce_op::plus<result_t>{});
           break;
         case reduction_type_t::MINIMUM:
           results[reduction_type] = transform_reduce_v(*handle_,
                                                        mg_graph_view,
-                                                       d_mg_renumber_map_labels->begin(),
-                                                       prop,
+                                                       (*d_mg_renumber_map_labels).begin(),
+                                                       v_op,
                                                        property_initial_value,
-                                                       cugraph::reduce_op::minimum<property_t>{});
+                                                       cugraph::reduce_op::minimum<result_t>{});
           break;
         case reduction_type_t::MAXIMUM:
           results[reduction_type] = transform_reduce_v(*handle_,
                                                        mg_graph_view,
-                                                       d_mg_renumber_map_labels->begin(),
-                                                       prop,
+                                                       (*d_mg_renumber_map_labels).begin(),
+                                                       v_op,
                                                        property_initial_value,
-                                                       cugraph::reduce_op::maximum<property_t>{});
+                                                       cugraph::reduce_op::maximum<result_t>{});
           break;
         default: FAIL() << "should not be reached.";
       }
@@ -241,38 +217,38 @@ class Tests_MGTransformReduceV
       auto sg_graph_view = sg_graph.view();
 
       for (auto reduction_type : reduction_types) {
-        property_t expected_result{};
+        result_t expected_result{};
         switch (reduction_type) {
           case reduction_type_t::PLUS:
             expected_result = transform_reduce_v(
               *handle_,
               sg_graph_view,
               thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
-              prop,
+              v_op,
               property_initial_value,
-              cugraph::reduce_op::plus<property_t>{});
+              cugraph::reduce_op::plus<result_t>{});
             break;
           case reduction_type_t::MINIMUM:
             expected_result = transform_reduce_v(
               *handle_,
               sg_graph_view,
               thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
-              prop,
+              v_op,
               property_initial_value,
-              cugraph::reduce_op::minimum<property_t>{});
+              cugraph::reduce_op::minimum<result_t>{});
             break;
           case reduction_type_t::MAXIMUM:
             expected_result = transform_reduce_v(
               *handle_,
               sg_graph_view,
               thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
-              prop,
+              v_op,
               property_initial_value,
-              cugraph::reduce_op::maximum<property_t>{});
+              cugraph::reduce_op::maximum<result_t>{});
             break;
           default: FAIL() << "should not be reached.";
         }
-        result_compare<property_t> compare{};
+        result_compare<result_t> compare{};
         ASSERT_TRUE(compare(expected_result, results[reduction_type]));
       }
     }
@@ -291,14 +267,14 @@ using Tests_MGTransformReduceV_Rmat = Tests_MGTransformReduceV<cugraph::test::Rm
 TEST_P(Tests_MGTransformReduceV_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(std::get<0>(param),
-                                                                           std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
 }
 
 TEST_P(Tests_MGTransformReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, false>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
@@ -306,14 +282,14 @@ TEST_P(Tests_MGTransformReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTranspose
 TEST_P(Tests_MGTransformReduceV_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(std::get<0>(param),
-                                                                          std::get<1>(param));
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
 }
 
 TEST_P(Tests_MGTransformReduceV_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
 {
   auto param = GetParam();
-  run_current_test<int32_t, int32_t, float, std::tuple<int, float>, true>(
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
     std::get<0>(param),
     cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
 }
diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
index ff8f0850368..25edd3b2748 100644
--- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/high_res_clock.h>
@@ -56,39 +58,6 @@
 
 #include <random>
 
-template <typename TupleType, typename T, std::size_t... Is>
-__device__ __host__ auto make_type_casted_tuple_from_scalar(T val, std::index_sequence<Is...>)
-{
-  return thrust::make_tuple(
-    static_cast<typename thrust::tuple_element<Is, TupleType>::type>(val)...);
-}
-
-template <typename property_t, typename T>
-__device__ __host__ auto make_property_value(T val)
-{
-  property_t ret{};
-  if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
-    ret = make_type_casted_tuple_from_scalar<property_t>(
-      val, std::make_index_sequence<thrust::tuple_size<property_t>::value>{});
-  } else {
-    ret = static_cast<property_t>(val);
-  }
-  return ret;
-}
-
-template <typename vertex_t, typename property_t>
-struct property_transform_t {
-  int mod{};
-
-  constexpr __device__ property_t operator()(vertex_t const v) const
-  {
-    static_assert(cugraph::is_thrust_tuple_of_arithmetic<property_t>::value ||
-                  std::is_arithmetic_v<property_t>);
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    return make_property_value<property_t>(hash_func(v) % mod);
-  }
-};
-
 template <typename key_t, typename vertex_t, typename property_t, typename payload_t>
 struct e_op_t {
   __device__ auto operator()(key_t optionally_tagged_src,
@@ -169,7 +138,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
       hr_clock.start();
     }
 
-    auto [mg_graph, mg_renumber_map_labels] =
+    auto [mg_graph, d_mg_renumber_map_labels] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, is_multi_gpu>(
         *handle_, input_usecase, false, renumber);
 
@@ -187,28 +156,12 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
 
     const int hash_bin_count = 5;
 
-    auto mg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-    thrust::transform(handle_->get_thrust_policy(),
-                      (*mg_renumber_map_labels).begin(),
-                      (*mg_renumber_map_labels).end(),
-                      cugraph::get_dataframe_buffer_begin(mg_property_buffer),
-                      property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-    cugraph::edge_src_property_t<decltype(mg_graph_view), property_t> mg_src_properties(
-      *handle_, mg_graph_view);
-    cugraph::edge_dst_property_t<decltype(mg_graph_view), property_t> mg_dst_properties(
-      *handle_, mg_graph_view);
-
-    update_edge_src_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_src_properties);
-    update_edge_dst_property(*handle_,
-                             mg_graph_view,
-                             cugraph::get_dataframe_buffer_cbegin(mg_property_buffer),
-                             mg_dst_properties);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, property_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, property_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, property_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
 
     auto mg_key_buffer = cugraph::allocate_dataframe_buffer<key_t>(
       mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -221,7 +174,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
       thrust::tabulate(handle_->get_thrust_policy(),
                        cugraph::get_dataframe_buffer_begin(mg_key_buffer),
                        cugraph::get_dataframe_buffer_end(mg_key_buffer),
-                       [mg_renumber_map_labels = (*mg_renumber_map_labels).data(),
+                       [mg_renumber_map_labels = (*d_mg_renumber_map_labels).data(),
                         local_vertex_partition_range_first =
                           mg_graph_view.local_vertex_partition_range_first()] __device__(size_t i) {
                          return thrust::make_tuple(
@@ -255,8 +208,8 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
         *handle_,
         mg_graph_view,
         mg_vertex_frontier.bucket(bucket_idx_cur),
-        mg_src_properties.view(),
-        mg_dst_properties.view(),
+        mg_src_prop.view(),
+        mg_dst_prop.view(),
         e_op_t<key_t, vertex_t, property_t, payload_t>{},
         cugraph::reduce_op::null{});
     } else {
@@ -265,8 +218,8 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
           *handle_,
           mg_graph_view,
           mg_vertex_frontier.bucket(bucket_idx_cur),
-          mg_src_properties.view(),
-          mg_dst_properties.view(),
+          mg_src_prop.view(),
+          mg_dst_prop.view(),
           e_op_t<key_t, vertex_t, property_t, payload_t>{},
           cugraph::reduce_op::plus<payload_t>{});
     }
@@ -284,7 +237,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
 
     if (prims_usecase.check_correctness) {
       auto mg_aggregate_renumber_map_labels = cugraph::test::device_gatherv(
-        *handle_, (*mg_renumber_map_labels).data(), (*mg_renumber_map_labels).size());
+        *handle_, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
 
       auto mg_aggregate_new_frontier_key_buffer =
         cugraph::allocate_dataframe_buffer<key_t>(0, handle_->get_stream());
@@ -352,28 +305,15 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
             *handle_, input_usecase, false, false);
         auto sg_graph_view = sg_graph.view();
 
-        auto sg_property_buffer = cugraph::allocate_dataframe_buffer<property_t>(
-          sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
-
-        thrust::transform(
-          handle_->get_thrust_policy(),
-          thrust::make_counting_iterator(vertex_t{0}),
-          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_size()),
-          cugraph::get_dataframe_buffer_begin(sg_property_buffer),
-          property_transform_t<vertex_t, property_t>{hash_bin_count});
-
-        cugraph::edge_src_property_t<decltype(sg_graph_view), property_t> sg_src_properties(
-          *handle_, sg_graph_view);
-        cugraph::edge_dst_property_t<decltype(sg_graph_view), property_t> sg_dst_properties(
-          *handle_, sg_graph_view);
-        update_edge_src_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_src_properties);
-        update_edge_dst_property(*handle_,
-                                 sg_graph_view,
-                                 cugraph::get_dataframe_buffer_cbegin(sg_property_buffer),
-                                 sg_dst_properties);
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, property_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, property_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, property_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
 
         auto sg_key_buffer = cugraph::allocate_dataframe_buffer<key_t>(
           sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -408,8 +348,8 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
             *handle_,
             sg_graph_view,
             sg_vertex_frontier.bucket(bucket_idx_cur),
-            sg_src_properties.view(),
-            sg_dst_properties.view(),
+            sg_src_prop.view(),
+            sg_dst_prop.view(),
             e_op_t<key_t, vertex_t, property_t, payload_t>{},
             cugraph::reduce_op::null{});
         } else {
@@ -418,8 +358,8 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
               *handle_,
               sg_graph_view,
               sg_vertex_frontier.bucket(bucket_idx_cur),
-              sg_src_properties.view(),
-              sg_dst_properties.view(),
+              sg_src_prop.view(),
+              sg_dst_prop.view(),
               e_op_t<key_t, vertex_t, property_t, payload_t>{},
               cugraph::reduce_op::plus<payload_t>{});
         }
diff --git a/cpp/tests/prims/property_generator.cuh b/cpp/tests/prims/property_generator.cuh
new file mode 100644
index 00000000000..fa779778770
--- /dev/null
+++ b/cpp/tests/prims/property_generator.cuh
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <prims/update_edge_src_dst_property.cuh>
+
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <cuco/detail/hash_functions.cuh>
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <tuple>
+#include <vector>
+
+namespace cugraph {
+namespace test {
+
+namespace detail {
+
+template <typename TupleType, typename T, std::size_t... Is>
+__host__ __device__ auto make_type_casted_tuple_from_scalar(T val, std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    static_cast<typename thrust::tuple_element<Is, TupleType>::type>(val)...);
+}
+
+template <typename property_t, typename T>
+__host__ __device__ auto make_property_value(T val)
+{
+  property_t ret{};
+  if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
+    ret = make_type_casted_tuple_from_scalar<property_t>(
+      val, std::make_index_sequence<thrust::tuple_size<property_t>::value>{});
+  } else {
+    ret = static_cast<property_t>(val);
+  }
+  return ret;
+}
+
+template <typename vertex_t, typename property_t>
+struct property_transform {
+  int32_t mod{};
+
+  constexpr __device__ property_t operator()(vertex_t v) const
+  {
+    static_assert(cugraph::is_thrust_tuple_of_arithmetic<property_t>::value ||
+                  std::is_arithmetic_v<property_t>);
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    return make_property_value<property_t>(hash_func(v) % mod);
+  }
+};
+
+}  // namespace detail
+
+template <typename vertex_t, typename property_t>
+struct generate {
+ private:
+  using property_buffer_type =
+    decltype(allocate_dataframe_buffer<property_t>(size_t{0}, rmm::cuda_stream_view{}));
+
+ public:
+  static property_t initial_value(int32_t init)
+  {
+    return detail::make_property_value<property_t>(init);
+  }
+
+  static auto vertex_property(raft::handle_t const& handle,
+                              rmm::device_uvector<vertex_t> const& labels,
+                              int32_t hash_bin_count)
+  {
+    auto data = cugraph::allocate_dataframe_buffer<property_t>(labels.size(), handle.get_stream());
+    thrust::transform(handle.get_thrust_policy(),
+                      labels.begin(),
+                      labels.end(),
+                      cugraph::get_dataframe_buffer_begin(data),
+                      detail::property_transform<vertex_t, property_t>{hash_bin_count});
+    return data;
+  }
+
+  static auto vertex_property(raft::handle_t const& handle,
+                              thrust::counting_iterator<vertex_t> begin,
+                              thrust::counting_iterator<vertex_t> end,
+                              int32_t hash_bin_count)
+  {
+    auto length = thrust::distance(begin, end);
+    auto data   = cugraph::allocate_dataframe_buffer<property_t>(length, handle.get_stream());
+    thrust::transform(handle.get_thrust_policy(),
+                      begin,
+                      end,
+                      cugraph::get_dataframe_buffer_begin(data),
+                      detail::property_transform<vertex_t, property_t>{hash_bin_count});
+    return data;
+  }
+
+  template <typename graph_view_type>
+  static auto src_property(raft::handle_t const& handle,
+                           graph_view_type const& graph_view,
+                           property_buffer_type const& property)
+  {
+    auto output_property =
+      cugraph::edge_src_property_t<graph_view_type, property_t>(handle, graph_view);
+    update_edge_src_property(
+      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
+    return output_property;
+  }
+
+  template <typename graph_view_type>
+  static auto dst_property(raft::handle_t const& handle,
+                           graph_view_type const& graph_view,
+                           property_buffer_type const& property)
+  {
+    auto output_property =
+      cugraph::edge_dst_property_t<graph_view_type, property_t>(handle, graph_view);
+    update_edge_dst_property(
+      handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
+    return output_property;
+  }
+};
+
+}  // namespace test
+}  // namespace cugraph

From 368740ca1ce51e6a8675b97b4da3e8300dbf2ef2 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Thu, 15 Sep 2022 00:10:02 -0700
Subject: [PATCH 04/18] fix file name

---
 ...utgoine_e.cuh => per_v_random_select_transform_outgoing_e.cuh} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cpp/src/prims/{per_v_random_select_transform_outgoine_e.cuh => per_v_random_select_transform_outgoing_e.cuh} (100%)

diff --git a/cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
similarity index 100%
rename from cpp/src/prims/per_v_random_select_transform_outgoine_e.cuh
rename to cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh

From 1c24e1274b02d487fd8482e0c612cbccb4e8cb67 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 19 Sep 2022 09:50:29 -0700
Subject: [PATCH 05/18] add to_thrust_tuple utility function

---
 cpp/include/cugraph/utilities/thrust_tuple_utils.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
index 74e04af463c..230de14326a 100644
--- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
+++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
@@ -184,6 +184,18 @@ auto std_tuple_to_thrust_tuple(TupleType tup)
     tup, std::make_index_sequence<std::tuple_size_v<TupleType>>{});
 }
 
+template <typename T>
+auto to_thrust_tuple(T scalar_value)
+{
+  return thrust::make_tuple(scalar_value);
+}
+
+template <typename... Ts>
+auto to_thrust_tuple(thrust::tuple<Ts...> tuple_value)
+{
+  return tuple_value;
+}
+
 // a temporary function to emulate thrust::tuple_cat (not supported) using std::tuple_cat (should
 // retire once thrust::tuple is replaced with cuda::std::tuple)
 template <typename... TupleTypes>

From 193cb0dd4b4c06cd5732886ac9897716fb4e9565 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 19 Sep 2022 09:51:05 -0700
Subject: [PATCH 06/18] fix compile error in vertex_frontier.cuh

---
 cpp/src/prims/vertex_frontier.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh
index 72fec4518cf..1e628893235 100644
--- a/cpp/src/prims/vertex_frontier.cuh
+++ b/cpp/src/prims/vertex_frontier.cuh
@@ -153,7 +153,8 @@ class key_bucket_t {
         vertices_ = std::move(merged_vertices);
       } else {
         auto cur_size = vertices_.size();
-        vertices_.resize(cur_size + thrust::distance(vertex_first, vertex_last));
+        vertices_.resize(cur_size + thrust::distance(vertex_first, vertex_last),
+                         handle_ptr_->get_stream());
         thrust::copy(handle_ptr_->get_thrust_policy(),
                      vertex_first,
                      vertex_last,

From 313dac9582f27b0c890d380bc17762a66acf71e9 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 19 Sep 2022 13:19:15 -0700
Subject: [PATCH 07/18] fix compiler warnings

---
 .../cugraph/edge_partition_device_view.cuh    | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index 8fae1f93054..e0538a78241 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -211,13 +211,16 @@ class edge_partition_device_view_t<vertex_t,
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     if (dcs_nzd_vertices_) {
-      thrust::transform(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(this->major_range_first()),
-        thrust::make_counting_iterator(this->major_range_last()),
-        local_degrees.begin(),
-        detail::local_degree_op_t<vertex_t, edge_t, multi_gpu, true>{
-          this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
+      assert(major_hypersparse_first_);
+      thrust::transform(rmm::exec_policy(stream),
+                        thrust::make_counting_iterator(this->major_range_first()),
+                        thrust::make_counting_iterator(this->major_range_last()),
+                        local_degrees.begin(),
+                        detail::local_degree_op_t<vertex_t, edge_t, multi_gpu, true>{
+                          this->offsets_,
+                          major_range_first_,
+                          *dcs_nzd_vertices_,
+                          major_hypersparse_first_.value_or(vertex_t{0})});
     } else {
       thrust::transform(
         rmm::exec_policy(stream),
@@ -235,13 +238,16 @@ class edge_partition_device_view_t<vertex_t,
   {
     rmm::device_uvector<edge_t> local_degrees(majors.size(), stream);
     if (dcs_nzd_vertices_) {
-      thrust::transform(
-        rmm::exec_policy(stream),
-        majors.begin(),
-        majors.end(),
-        local_degrees.begin(),
-        detail::local_degree_op_t<vertex_t, edge_t, multi_gpu, true>{
-          this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
+      assert(major_hypersparse_first_);
+      thrust::transform(rmm::exec_policy(stream),
+                        majors.begin(),
+                        majors.end(),
+                        local_degrees.begin(),
+                        detail::local_degree_op_t<vertex_t, edge_t, multi_gpu, true>{
+                          this->offsets_,
+                          major_range_first_,
+                          dcs_nzd_vertices_.value(),
+                          major_hypersparse_first_.value_or(vertex_t{0})});
     } else {
       thrust::transform(
         rmm::exec_policy(stream),

From f57568a146e9088d8d87fa5d1bb4a0874a7cabb3 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 19 Sep 2022 13:19:36 -0700
Subject: [PATCH 08/18] add test suit

---
 cpp/tests/CMakeLists.txt                      |   6 +
 ...er_v_random_select_transform_outgoing_e.cu | 292 ++++++++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4179ee026e0..c8357b89c50 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -534,6 +534,12 @@ if(BUILD_CUGRAPH_MG_TESTS)
                     prims/mg_extract_transform_v_frontier_outgoing_e.cu)
     target_link_libraries(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
+    ###########################################################################################
+    # - MG PRIMS PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E tests -------------------------------
+    ConfigureTestMG(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST
+                    prims/mg_per_v_random_select_transform_outgoing_e.cu)
+    target_link_libraries(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST PRIVATE cuco::cuco)
+
     ###########################################################################################
     # - MG GATHER_UTILS tests -----------------------------------------------------------------
     ConfigureTestMG(MG_GATHER_UTILS_TEST sampling/detail/mg_gather_utils.cu)
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
new file mode 100644
index 00000000000..a1b6df9ad7d
--- /dev/null
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/high_res_clock.h>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <prims/per_v_random_select_transform_outgoing_e.cuh>
+#include <prims/vertex_frontier.cuh>
+
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <cugraph/graph_view.hpp>
+
+#include <raft/comms/comms.hpp>
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+template <typename vertex_t, typename property_t>
+struct e_op_t {
+  using result_t = decltype(cugraph::thrust_tuple_cat(thrust::tuple<vertex_t, vertex_t>{},
+                                                      cugraph::to_thrust_tuple(property_t{}),
+                                                      cugraph::to_thrust_tuple(property_t{})));
+
+  __device__ result_t operator()(vertex_t src,
+                                 vertex_t dst,
+                                 property_t src_prop,
+                                 property_t dst_prop) const
+  {
+    if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
+      static_assert(thrust::tuple_size<property_t>::value == size_t{2});
+      return thrust::make_tuple(src,
+                                dst,
+                                thrust::get<0>(src_prop),
+                                thrust::get<1>(src_prop),
+                                thrust::get<0>(dst_prop),
+                                thrust::get<1>(dst_prop));
+    } else {
+      return thrust::make_tuple(src, dst, src_prop, dst_prop);
+    }
+  }
+};
+
+struct Prims_Usecase {
+  size_t K{0};
+  bool with_replacement{false};
+  bool test_weighted{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGPerVRandomSelectTransformOutgoingE
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGPerVRandomSelectTransformOutgoingE() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Verify the results of per_v_random_select_transform_outgoing_e primitive
+  template <typename vertex_t, typename edge_t, typename weight_t, typename property_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResClock hr_clock{};
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    auto const comm_size = handle_->get_comms().get_size();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, prims_usecase.test_weighted, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 2. run MG per_v_random_select_transform_outgoing_e primitive
+
+    const int hash_bin_count = 5;
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, property_t>::vertex_property(
+      *handle_, *d_mg_renumber_map_labels, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, property_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, property_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_vertex_buffer = rmm::device_uvector<vertex_t>(
+      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+    thrust::sequence(handle_->get_thrust_policy(),
+                     cugraph::get_dataframe_buffer_begin(mg_vertex_buffer),
+                     cugraph::get_dataframe_buffer_end(mg_vertex_buffer),
+                     mg_graph_view.local_vertex_partition_range_first());
+
+    constexpr size_t bucket_idx_cur = 0;
+    constexpr size_t num_buckets    = 1;
+
+    cugraph::vertex_frontier_t<vertex_t, void, true, false> mg_vertex_frontier(*handle_,
+                                                                               num_buckets);
+    mg_vertex_frontier.bucket(bucket_idx_cur)
+      .insert(cugraph::get_dataframe_buffer_begin(mg_vertex_buffer),
+              cugraph::get_dataframe_buffer_end(mg_vertex_buffer));
+
+    raft::random::RngState rng_state(static_cast<uint64_t>(handle_->get_comms().get_rank()));
+
+    using result_t = decltype(cugraph::thrust_tuple_cat(thrust::tuple<vertex_t, vertex_t>{},
+                                                        cugraph::to_thrust_tuple(property_t{}),
+                                                        cugraph::to_thrust_tuple(property_t{})));
+
+    std::optional<result_t> invalid_value{std::nullopt};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [sample_offsets, sample_e_op_results] =
+      cugraph::per_v_random_select_transform_outgoing_e(*handle_,
+                                                        mg_graph_view,
+                                                        mg_vertex_frontier.bucket(bucket_idx_cur),
+                                                        mg_src_prop.view(),
+                                                        mg_dst_prop.view(),
+                                                        e_op_t<vertex_t, property_t>{},
+                                                        rng_state,
+                                                        prims_usecase.K,
+                                                        prims_usecase.with_replacement,
+                                                        invalid_value);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG per_v_random_select_transform_outgoing_e took " << elapsed_time * 1e-6
+                << " s.\n";
+    }
+
+    // 3. validate MG results
+
+    if (prims_usecase.check_correctness) {
+#if 0
+      cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(*handle_);
+      std::tie(sg_graph, std::ignore) =
+        cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+          *handle_, input_usecase, prims_usecase.test_weighted, false);
+      auto sg_graph_view = sg_graph.view();
+
+// 1. check whether sources coincide with the local vertices in the frontier or not
+
+// 2. check sample counts
+
+// 3. check destinations exist in the input graph
+
+// 4. check source/destination property values
+
+      auto sg_vertex_property_data = generate<property_t>::vertex_property(
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+        hash_bin_count,
+        *handle_);
+      auto sg_dst_prop =
+        generate<property_t>::dst_property(*handle_, sg_graph_view, sg_vertex_property_data);
+      auto sg_src_prop =
+        generate<property_t>::src_property(*handle_, sg_graph_view, sg_vertex_property_data);
+
+      auto expected_result = count_if_e(
+        *handle_,
+        sg_graph_view,
+        sg_src_prop.view(),
+        sg_dst_prop.view(),
+        [] __device__(auto src, auto dst, weight_t, auto src_property, auto dst_property) {
+          return src_property < dst_property;
+        });
+      ASSERT_TRUE(expected_result == result);
+#endif
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGPerVRandomSelectTransformOutgoingE<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGPerVRandomSelectTransformOutgoingE_File =
+  Tests_MGPerVRandomSelectTransformOutgoingE<cugraph::test::File_Usecase>;
+using Tests_MGPerVRandomSelectTransformOutgoingE_Rmat =
+  Tests_MGPerVRandomSelectTransformOutgoingE<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGPerVRandomSelectTransformOutgoingE_File, CheckInt32Int32FloatTupleIntFloat)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(std::get<0>(param),
+                                                                       std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, CheckInt32Int32FloatTupleIntFloat)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVRandomSelectTransformOutgoingE_File, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGPerVRandomSelectTransformOutgoingE_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGPerVRandomSelectTransformOutgoingE_Rmat,
+  ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_large_test,
+  Tests_MGPerVRandomSelectTransformOutgoingE_Rmat,
+  ::testing::Combine(::testing::Values(Prims_Usecase{false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()

From 41abaf00747939dfe0a904b55473156fe128b2ae Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 19 Sep 2022 13:19:50 -0700
Subject: [PATCH 09/18] fix compile errors

---
 ...r_v_random_select_transform_outgoing_e.cuh | 327 ++++++++++--------
 1 file changed, 184 insertions(+), 143 deletions(-)

diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 8d22b906ff6..32c863191ee 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -15,6 +15,29 @@
  */
 #pragma once
 
+#include <prims/property_op_utils.cuh>
+
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/shuffle_comm.cuh>
+
+#include <raft/random/rng.cuh>
+#ifndef NO_CUGRAPH_OPS
+#include <cugraph-ops/graph/sampling.hpp>
+#endif
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/tabulate.h>
+#include <thrust/tuple.h>
+
+#include <optional>
+#include <tuple>
+
 namespace cugraph {
 
 namespace detail {
@@ -29,7 +52,8 @@ struct convert_pair_to_triplet_t {
   int32_t col_comm_size{};
   edge_t invalid_idx{};
 
-  __device__ int32_t operator()(thrust::tuple<edge_t, size_t> index_pair) const
+  __device__ thrust::tuple<int32_t, edge_t, size_t> operator()(
+    thrust::tuple<edge_t, size_t> index_pair) const
   {
     auto nbr_idx       = thrust::get<0>(index_pair);
     auto key_idx       = thrust::get<1>(index_pair);
@@ -83,14 +107,16 @@ struct transform_and_count_local_nbr_indices_t {
   LocalNbrIdxIterator local_nbr_idx_first{};
   OutputValueIterator output_value_first{};
   thrust::optional<OutputCountIterator> output_count_first{};
-  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
-    EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgeOp e_op{};
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input;
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input;
+  EdgeOp e_op{};
   edge_t invalid_idx{};
   thrust::optional<T> invalid_value{thrust::nullopt};
 
   __device__ void operator()(size_t key_idx) const
   {
     auto key = *(key_first + key_idx);
+    vertex_t major{};
     if constexpr (std::is_same_v<key_t, vertex_t>) {
       major = key;
     } else {
@@ -115,8 +141,8 @@ struct transform_and_count_local_nbr_indices_t {
     } else {
       thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(major_offset);
     }
-    auto start_offset = *(local_nbr_idx_offset_first + key_idx);
-    auto end_offset   = *(local_nbr_idx_offset_first + (key_idx + 1));
+    auto start_offset = *(offset_first + key_idx);
+    auto end_offset   = *(offset_first + (key_idx + 1));
 
     size_t num_valid_local_nbr_indices{0};
     for (size_t i = start_offset; i < end_offset; ++i) {
@@ -137,22 +163,22 @@ struct transform_and_count_local_nbr_indices_t {
           key_or_src = key;
           key_or_dst = minor;
         }
-        auto src_offset     = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset     = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-        *(output_first + i) = evaluate_edge_op<GraphViewType,
-                                               key_t,
-                                               EdgePartitionSrcValueInputWrapper,
-                                               EdgePartitionDstValueInputWrapper,
-                                               EdgeOp>()
-                                .compute(key_or_src,
-                                         key_or_dst,
-                                         weights ? (*weights)[local_nbr_idx] : weight_t{1.0},
-                                         edge_partition_src_value_input.get(src_offset),
-                                         edge_partition_dst_value_input.get(dst_offset),
-                                         e_op);
+        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+        *(output_value_first + i) = evaluate_edge_op<GraphViewType,
+                                                     key_t,
+                                                     EdgePartitionSrcValueInputWrapper,
+                                                     EdgePartitionDstValueInputWrapper,
+                                                     EdgeOp>()
+                                      .compute(key_or_src,
+                                               key_or_dst,
+                                               weights ? (*weights)[local_nbr_idx] : weight_t{1.0},
+                                               edge_partition_src_value_input.get(src_offset),
+                                               edge_partition_dst_value_input.get(dst_offset),
+                                               e_op);
         ++num_valid_local_nbr_indices;
       } else if (invalid_value) {
-        *(output_first + i) = *invalid_value;
+        *(output_value_first + i) = *invalid_value;
       } else {
         assert(output_count_first);
       }
@@ -168,6 +194,7 @@ struct copy_and_fill_sample_e_op_results_t {
   InputIterator input_first{};
   OutputIterator output_first{};
   size_t K{};
+  typename thrust::iterator_traits<OutputIterator>::value_type invalid_value;
 
   __device__ void operator()(size_t i) const
   {
@@ -210,6 +237,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   using edge_t   = typename GraphViewType::edge_type;
   using weight_t = typename GraphViewType::weight_type;
   using key_t    = typename VertexFrontierBucketType::key_type;
+  using key_buffer_t =
+    decltype(allocate_dataframe_buffer<key_t>(size_t{0}, rmm::cuda_stream_view{}));
 
   using edge_partition_src_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
@@ -232,7 +261,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                          vertex_t,
                          typename EdgeDstValueInputWrapper::value_iterator>>>;
 
-  static_assert(!GraphViewType::is_storage_transposed == incoming);
+  static_assert(GraphViewType::is_storage_transposed == incoming);
   static_assert(std::is_same_v<typename evaluate_edge_op<GraphViewType,
                                                          key_t,
                                                          EdgeSrcValueInputWrapper,
@@ -290,15 +319,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
 
   auto aggregate_local_frontier_keys =
     GraphViewType::is_multi_gpu
-      ? std::make_optional<decltype(
-          allocate_dataframe_buffer<key_t>(size_t{0}, rmm::cuda_stream_view{}))>(
+      ? std::make_optional<key_buffer_t>(
           local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
       : std::nullopt;
   if constexpr (GraphViewType::is_multi_gpu) {
     auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
     device_allgatherv(col_comm,
                       frontier_key_first,
-                      get_dataframe_buffer_begin(aggregate_local_frontier_keys),
+                      get_dataframe_buffer_begin(*aggregate_local_frontier_keys),
                       local_frontier_sizes,
                       local_frontier_displacements,
                       handle.get_stream());
@@ -307,7 +335,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   // 2. compute degrees
 
   auto aggregate_local_frontier_local_degrees =
-    GrpahViewType::is_multi_gpu
+    GraphViewType::is_multi_gpu
       ? std::make_optional<rmm::device_uvector<edge_t>>(
           local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream())
       : std::nullopt;
@@ -318,7 +346,6 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         graph_view.local_edge_partition_view(i));
 
     vertex_t const* edge_partition_frontier_major_first{nullptr};
-    vertex_t const* edge_partition_frontier_major_last{nullptr};
 
     auto edge_partition_frontier_key_first =
       (GraphViewType::is_multi_gpu ? get_dataframe_buffer_begin(*aggregate_local_frontier_keys)
@@ -329,12 +356,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     } else {
       edge_partition_frontier_major_first = thrust::get<0>(edge_partition_frontier_key_first);
     }
-    edge_partition_frontier_major_last =
-      edge_partition_frontier_major_first + local_frontier_sizes[i];
 
     auto edge_partition_frontier_local_degrees = edge_partition.compute_local_degrees(
-      raft::device_span<vertex_t const*>(edge_partition_frontier_major_first,
-                                         edge_partition_frontier_size),
+      raft::device_span<vertex_t const>(edge_partition_frontier_major_first,
+                                        local_frontier_sizes[i]),
       handle.get_stream());
 
     if constexpr (GraphViewType::is_multi_gpu) {
@@ -351,21 +376,24 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   }
 
   auto frontier_gathered_local_degrees =
-    GrpahViewType::is_multi_gpu
+    GraphViewType::is_multi_gpu
       ? std::make_optional<rmm::device_uvector<edge_t>>(frontier.size() * K, handle.get_stream())
       : std::nullopt;
-  if constexpr (GrpahViewType::is_multi_gpu) {
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    auto const col_comm_size = col_comm.get_size();
+
     std::tie(frontier_gathered_local_degrees, std::ignore) =
       shuffle_values(col_comm,
                      (*aggregate_local_frontier_local_degrees).begin(),
                      local_frontier_sizes,
                      handle.get_stream());
-    thrust::tablulate(handle.get_thrust_policy(),
-                      frontier_degrees.begin(),
-                      frontier_degrees.end(),
-                      strided_sum_t<edge_t>{frontier_gathered_local_degrees.data(),
-                                            frontier.size(),
-                                            static_cast<size_t>(col_comm_size)});
+    thrust::tabulate(handle.get_thrust_policy(),
+                     frontier_degrees.begin(),
+                     frontier_degrees.end(),
+                     strided_sum_t<edge_t>{(*frontier_gathered_local_degrees).data(),
+                                           frontier.size(),
+                                           static_cast<size_t>(col_comm_size)});
     aggregate_local_frontier_local_degrees = std::nullopt;
   }
 
@@ -375,13 +403,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   // FIXME: get_sampling_index is inefficient when degree >> K & with_replacement = false
   // FIXME: Need to verify that sample_nbr_indices[] are filled with INVALID_IDX when degree == 0
   // (with_replacement = true) or degree < K (with_replacement = false)
-  cugraph_ops::get_sampling_index(sample_nbr_indices.data(),
-                                  rng_state,
-                                  frontier_degrees.data(),
-                                  static_cast<edge_t>(frontier.size()),
-                                  static_cats<int32_t>(K),
-                                  with_replacement,
-                                  handle.get_stream());
+  cugraph::ops::gnn::graph::get_sampling_index(sample_nbr_indices.data(),
+                                               rng_state,
+                                               frontier_degrees.data(),
+                                               static_cast<edge_t>(frontier_degrees.size()),
+                                               static_cast<int32_t>(K),
+                                               with_replacement,
+                                               handle.get_stream());
   frontier_degrees.resize(0, handle.get_stream());
   frontier_degrees.shrink_to_fit(handle.get_stream());
 
@@ -392,13 +420,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                           // neighbors are distributed in col_comm_size partitions)
   auto sample_key_indices =
     GraphViewType::is_multi_gpu
-      ? std::make_optional<rmm::devie_uvector<size_t>>(0, handle.get_stream())
+      ? std::make_optional<rmm::device_uvector<size_t>>(0, handle.get_stream())
       : std::nullopt;
-  auto local_frontier_sample_counts =
-    GrpahViewType::is_multi_gpu ? std::vector<size_t>(0) : std::vector<size_t>{frontier.size() * K};
-  auto local_frontier_sample_displacements =
-    GrpahViewType::is_multi_gpu ? std::vector<size_t>(0) : std::vector<size_t>{0};
-  if constexpr (GrpahViewType::is_multi_gpu) {
+  auto local_frontier_sample_counts        = std::vector<size_t>{};
+  auto local_frontier_sample_displacements = std::vector<size_t>{};
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    auto const col_comm_size = col_comm.get_size();
+
     auto col_comm_ranks =
       rmm::device_uvector<int32_t>(sample_local_nbr_indices.size(), handle.get_stream());
     auto sample_key_indices =
@@ -407,41 +436,43 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
       thrust::make_tuple(sample_local_nbr_indices.begin(),
                          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
                                                          divider_t<size_t>{K})));
-    thrust::transform(handle.get_thrsut_policy(),
-                      input_pair_first,
-                      input_pair_first + sample_indices.size(),
-                      thrust::make_zip_iterator(thrust::make_tuple(
-                        col_comm_ranks.begin(), sample_indices.begin(), key_indices.begin())),
-                      convert_pair_to_triplet_t<edge_t>{
-                        raft::device_span<edge_t const>(frontier_gathered_local_degrees.data(),
-                                                        frontier_gathered_local_degrees.size()),
-                        frontier.size(),
-                        K,
-                        col_comm_size,
-                        ops::gnn::graph::INVALID_IDX<edge_t>});
+    thrust::transform(
+      handle.get_thrust_policy(),
+      input_pair_first,
+      input_pair_first + sample_local_nbr_indices.size(),
+      thrust::make_zip_iterator(thrust::make_tuple(
+        col_comm_ranks.begin(), sample_local_nbr_indices.begin(), sample_key_indices.begin())),
+      convert_pair_to_triplet_t<edge_t>{
+        raft::device_span<edge_t const>((*frontier_gathered_local_degrees).data(),
+                                        (*frontier_gathered_local_degrees).size()),
+        frontier.size(),
+        K,
+        col_comm_size,
+        cugraph::ops::gnn::graph::INVALID_ID<edge_t>});
 
     frontier_gathered_local_degrees = std::nullopt;
 
     auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
-      sample_local_nbr_indices.bgin(), col_comm_ranks.begin(), key_indices.begin()));
+      sample_local_nbr_indices.begin(), col_comm_ranks.begin(), sample_key_indices.begin()));
     sample_local_nbr_indices.resize(
       thrust::distance(triplet_first,
                        thrust::remove_if(handle.get_thrust_policy(),
                                          triplet_first,
                                          triplet_first + sample_local_nbr_indices.size(),
-                                         invalid_col_comm_rank<edge_t>{int32_t{-1}})),
+                                         invalid_col_comm_rank_t<edge_t>{int32_t{-1}})),
       handle.get_stream());
     col_comm_ranks.resize(sample_local_nbr_indices.size(), handle.get_stream());
     sample_key_indices.resize(sample_local_nbr_indices.size(), handle.get_stream());
 
-    auto d_tx_counts = groupby_and_count(col_comm_ranks.begin(),
-                                         col_comm_ranks.end(),
-                                         thrust::make_zip_iterator(thrust::make_tuple(
-                                           sample_local_nbr_indices.begin(), key_indices.begin())),
-                                         thrust::identity<int32_t>{},
-                                         col_comm_size,
-                                         std::numeric_limits<size_t>::max(),
-                                         handle.get_stream());
+    auto d_tx_counts =
+      groupby_and_count(col_comm_ranks.begin(),
+                        col_comm_ranks.end(),
+                        thrust::make_zip_iterator(thrust::make_tuple(
+                          sample_local_nbr_indices.begin(), sample_key_indices.begin())),
+                        thrust::identity<int32_t>{},
+                        col_comm_size,
+                        std::numeric_limits<size_t>::max(),
+                        handle.get_stream());
 
     std::vector<size_t> h_tx_counts(d_tx_counts.size());
     raft::update_host(
@@ -449,7 +480,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     handle.sync_stream();
 
     auto pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_local_nbr_indices.bgin(), key_indices.begin()));
+      thrust::make_tuple(sample_local_nbr_indices.begin(), sample_key_indices.begin()));
     auto [rx_value_buffer, rx_counts] =
       shuffle_values(col_comm, pair_first, h_tx_counts, handle.get_stream());
 
@@ -457,8 +488,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     sample_key_indices                  = std::move(std::get<1>(rx_value_buffer));
     local_frontier_sample_displacements = std::vector<size_t>(rx_counts.size());
     std::exclusive_scan(
-      rx_counts.begin(), rx_counts.end(), local_frontier_sample_displacements.begin());
+      rx_counts.begin(), rx_counts.end(), local_frontier_sample_displacements.begin(), size_t{0});
     local_frontier_sample_counts = std::move(rx_counts);
+  } else {
+    local_frontier_sample_counts.push_back(frontier.size() * K);
+    local_frontier_sample_displacements.push_back(size_t{0});
   }
 
   // 5. transform
@@ -467,7 +501,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     (!GraphViewType::is_multi_gpu && !invalid_value)
       ? std::make_optional<rmm::device_uvector<size_t>>(frontier.size(), handle.get_stream())
       : std::nullopt;
-  auto sample_e_op_results = allocate_dataframe_buffer<e_op_result_t>(
+  auto sample_e_op_results = allocate_dataframe_buffer<T>(
     local_frontier_sample_displacements.back() + local_frontier_sample_counts.back(),
     handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
@@ -483,7 +517,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
       sample_local_nbr_indices.begin() + local_frontier_sample_displacements[i];
 
     auto edge_partition_sample_e_op_result_first =
-      get_dataframe_buffer_beign(sample_e_op_results) + local_frontier_sample_displacements[i];
+      get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_displacements[i];
 
     edge_partition_src_input_device_view_t edge_partition_src_value_input{};
     edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
@@ -507,14 +541,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         thrust::count_if(handle.get_thrust_policy(),
                          thrust::make_counting_iterator(size_t{0}),
                          thrust::make_counting_iterator(local_frontier_sample_counts[i]),
-                         is_first_in_run_t<size_t const*>{sample_key_indices.data() +
+                         is_first_in_run_t<size_t const*>{(*sample_key_indices).data() +
                                                           local_frontier_sample_displacements[i]});
-      rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_strema());
+      rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_stream());
       rmm::device_uvector<size_t> unique_key_local_nbr_idx_counts(num_unique_key_indices,
                                                                   handle.get_stream());
       thrust::reduce_by_key(handle.get_thrust_policy(),
-                            sample_key_indices.begin() + local_frontier_sample_displacements[i],
-                            sample_key_indices.begin() + local_frontier_sample_displacements[i] +
+                            (*sample_key_indices).begin() + local_frontier_sample_displacements[i],
+                            (*sample_key_indices).begin() + local_frontier_sample_displacements[i] +
                               local_frontier_sample_counts[i],
                             thrust::make_constant_iterator(edge_t{1}),
                             unique_key_indices.begin(),
@@ -523,48 +557,20 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                                                    handle.get_stream());
       unique_key_local_nbr_idx_offsets.set_element_to_zero_async(size_t{0}, handle.get_stream());
       thrust::inclusive_scan(handle.get_thrust_policy(),
-                             unique_local_nbr_idx_counts.begin(),
-                             unique_local_nbr_idx_counts.end(),
-                             unique_local_nbr_idx_offsets.begin() + 1);
+                             unique_key_local_nbr_idx_counts.begin(),
+                             unique_key_local_nbr_idx_counts.end(),
+                             unique_key_local_nbr_idx_offsets.begin() + 1);
       auto offset_first = unique_key_local_nbr_idx_offsets.begin();
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        unique_key_indices.begin(),
-        unique_key_indices.end(),
-        transform_and_count_local_nbr_indices_t<GraphViewType,
-                                                decltype(edge_partition_frontier_key_first),
-                                                decltype(offset_first),
-                                                decltype(
-                                                  edge_partition_sample_local_nbr_index_first),
-                                                decltype(edge_partition_sample_e_op_result_first),
-                                                size_t const*,
-                                                edge_partition_src_input_device_view_t,
-                                                edge_partition_dst_input_device_view_t,
-                                                EdgeOp,
-                                                T>{edge_partition,
-                                                   edge_partition_frontier_key_first,
-                                                   offset_first,
-                                                   edge_partition_sample_local_nbr_index_first,
-                                                   edge_partition_sample_e_op_result_first,
-                                                   thrust::nullopt,
-                                                   edge_partition_src_value_input,
-                                                   edge_partition_dst_value_input,
-                                                   e_op,
-                                                   invalid_idx,
-                                                   invalid_value});
-    } else {
-      auto offset_first = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
-                                                          multiplier_t<size_t>{K});
       thrust::for_each(handle.get_thrust_policy(),
-                       thrust::make_counting_iterator(size_t{0}),
-                       thrust::make_counting_iterator(unique_key_indices.size()),
+                       unique_key_indices.begin(),
+                       unique_key_indices.end(),
                        transform_and_count_local_nbr_indices_t<
                          GraphViewType,
                          decltype(edge_partition_frontier_key_first),
                          decltype(offset_first),
                          decltype(edge_partition_sample_local_nbr_index_first),
                          decltype(edge_partition_sample_e_op_result_first),
-                         size_t const*,
+                         size_t*,
                          edge_partition_src_input_device_view_t,
                          edge_partition_dst_input_device_view_t,
                          EdgeOp,
@@ -573,13 +579,40 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                             offset_first,
                             edge_partition_sample_local_nbr_index_first,
                             edge_partition_sample_e_op_result_first,
-                            sample_counts ? thrust::optional<size_t const*>((*sample_counts).data())
-                                          : thrust::nullopt,
+                            thrust::nullopt,
                             edge_partition_src_value_input,
                             edge_partition_dst_value_input,
                             e_op,
-                            invalid_idx,
-                            invalid_value});
+                            cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
+                            invalid_value ? thrust::optional<T>{*invalid_value} : thrust::nullopt});
+    } else {
+      auto offset_first = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
+                                                          multiplier_t<size_t>{K});
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(frontier.size()),
+        transform_and_count_local_nbr_indices_t<
+          GraphViewType,
+          decltype(edge_partition_frontier_key_first),
+          decltype(offset_first),
+          decltype(edge_partition_sample_local_nbr_index_first),
+          decltype(edge_partition_sample_e_op_result_first),
+          size_t*,
+          edge_partition_src_input_device_view_t,
+          edge_partition_dst_input_device_view_t,
+          EdgeOp,
+          T>{edge_partition,
+             edge_partition_frontier_key_first,
+             offset_first,
+             edge_partition_sample_local_nbr_index_first,
+             edge_partition_sample_e_op_result_first,
+             sample_counts ? thrust::optional<size_t*>((*sample_counts).data()) : thrust::nullopt,
+             edge_partition_src_value_input,
+             edge_partition_dst_value_input,
+             e_op,
+             cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
+             invalid_value ? thrust::optional<T>{*invalid_value} : thrust::nullopt});
     }
   }
 
@@ -589,41 +622,49 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                       : std::make_optional<rmm::device_uvector<size_t>>(
                                           frontier.size() + 1, handle.get_stream());
   if (GraphViewType::is_multi_gpu) {
-    auto pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_e_op_results.bgin(), sample_key_indices.begin()));
-    auto [rx_value_buffer, rx_counts] =
-      shuffle_values(col_comm, pair_first, local_frontier_sample_counts, handle.get_stream());
-    sample_e_op_results = std::move(std::get<0>(rx_value_buffer));
-    sample_key_indices  = std::move(std::get<1>(rx_value_buffer));
+    auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+
+    std::vector<size_t> rx_counts{};
+    std::tie(sample_e_op_results, rx_counts) =
+      shuffle_values(col_comm,
+                     get_dataframe_buffer_begin(sample_e_op_results),
+                     local_frontier_sample_counts,
+                     handle.get_stream());
+    std::tie(sample_key_indices, std::ignore) = shuffle_values(
+      col_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream());
     // FIXME: better refactor this sort-and-reduce-by-key
     thrust::sort_by_key(handle.get_thrust_policy(),
-                        sample_key_indices.begin(),
-                        sample_key_indices.end(),
-                        sample_e_op_results.begin());
+                        (*sample_key_indices).begin(),
+                        (*sample_key_indices).end(),
+                        get_dataframe_buffer_begin(sample_e_op_results));
     auto num_unique_key_indices =
-    thrust::count_if(handle.get_thrust_policy(),
-                     thrust::make_counting_iterator(size_t{0}),
-                     thrust::make_counting_iterator(sample_key_indices.size()),
-                     is_first_in_run_t<size_t const*>{sample_key_indices.data()},
-    rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_strema());
+      thrust::count_if(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator(size_t{0}),
+                       thrust::make_counting_iterator((*sample_key_indices).size()),
+                       is_first_in_run_t<size_t const*>{(*sample_key_indices).data()});
+    rmm::device_uvector<size_t> unique_key_indices(num_unique_key_indices, handle.get_stream());
     rmm::device_uvector<size_t> unique_key_sample_counts(num_unique_key_indices,
-                                                                  handle.get_stream());
+                                                         handle.get_stream());
     thrust::reduce_by_key(handle.get_thrust_policy(),
-                          sample_key_indices.begin(),
-                          sample_key_indices.end(),
+                          (*sample_key_indices).begin(),
+                          (*sample_key_indices).end(),
                           thrust::make_constant_iterator(edge_t{1}),
                           unique_key_indices.begin(),
                           unique_key_sample_counts.begin());
     sample_counts = rmm::device_uvector<size_t>(frontier.size(), handle.get_stream());
     thrust::fill(
       handle.get_thrust_policy(), (*sample_counts).begin(), (*sample_counts).end(), size_t{0});
-    thrust::scatter(handle.get_thrust_policy(), unique_key_sample_counts.begin(), unique_key_sample_counts.end(), unique_key_indices.begin(), (*sample_counts).begin());
+    thrust::scatter(handle.get_thrust_policy(),
+                    unique_key_sample_counts.begin(),
+                    unique_key_sample_counts.end(),
+                    unique_key_indices.begin(),
+                    (*sample_counts).begin());
     if (invalid_value) {
       rmm::device_uvector<size_t> sample_displacements((*sample_counts).size(),
                                                        handle.get_stream());
       thrust::exclusive_scan(handle.get_thrust_policy(),
-                             (*sample_counts).beign(),
-                             (*sample_counts).emd(),
+                             (*sample_counts).begin(),
+                             (*sample_counts).end(),
                              sample_displacements.begin());
       auto tmp_sample_e_op_results =
         allocate_dataframe_buffer<T>(frontier.size() * K, handle.get_stream());
@@ -638,10 +679,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
           raft::device_span<size_t const>(sample_displacements.data(), sample_displacements.size()),
           input_first,
           output_first,
-          K});
+          K,
+          *invalid_value});
       sample_e_op_results = std::move(tmp_sample_e_op_results);
-    }
-    else {
+    } else {
       (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream());
       thrust::inclusive_scan(handle.get_thrust_policy(),
                              (*sample_counts).begin(),
@@ -738,7 +779,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
-  static_assert(false, "unimplemented.");
+  CUGRAPH_FAIL("unimplemented.");
 
   return std::make_tuple(std::nullopt,
                          allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
@@ -815,7 +856,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
-  return detail::per_v_random_select_transform_e<false>(hanlde,
+  return detail::per_v_random_select_transform_e<false>(handle,
                                                         graph_view,
                                                         frontier,
                                                         edge_src_value_input,

From da83d3ba42db17b1c3213e8e88adc09e5311f759 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Thu, 22 Sep 2022 20:17:38 -0700
Subject: [PATCH 10/18] silence spurious may-be-used-uninitialized warnings

---
 cpp/include/cugraph/edge_partition_device_view.cuh | 10 +++-------
 cpp/include/cugraph/utilities/misc_utils.cuh       | 10 ++++++++++
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index e0538a78241..df6a7ab5ad3 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -17,6 +17,7 @@
 
 #include <cugraph/edge_partition_view.hpp>
 #include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/misc_utils.cuh>
 
 #include <raft/span.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -165,13 +166,8 @@ class edge_partition_device_view_t<vertex_t,
   edge_partition_device_view_t(edge_partition_view_t<vertex_t, edge_t, weight_t, multi_gpu> view)
     : detail::edge_partition_device_view_base_t<vertex_t, edge_t, weight_t>(
         view.offsets(), view.indices(), view.weights()),
-      dcs_nzd_vertices_(
-        view.dcs_nzd_vertices()
-          ? thrust::optional<raft::device_span<vertex_t const>>{*(view.dcs_nzd_vertices())}
-          : thrust::nullopt),
-      major_hypersparse_first_(view.major_hypersparse_first()
-                                 ? thrust::optional<vertex_t>{*(view.major_hypersparse_first())}
-                                 : thrust::nullopt),
+      dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())),
+      major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())),
       major_range_first_(view.major_range_first()),
       major_range_last_(view.major_range_last()),
       minor_range_first_(view.minor_range_first()),
diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index 4d96360e2c5..2d34327d71a 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -23,7 +23,9 @@
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
+#include <thrust/optional.h>
 
+#include <optional>
 #include <tuple>
 #include <vector>
 
@@ -72,6 +74,14 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
   return std::make_tuple(h_vertex_offsets, h_edge_offsets);
 }
 
+template <typename T>
+thrust::optional<T> to_thrust_optional(std::optional<T> val)
+{
+  thrust::optional<T> ret{thrust::nullopt};
+  if (val) { ret = *val; }
+  return ret;
+}
+
 }  // namespace detail
 
 }  // namespace cugraph

From d5e415b98a9b0b4ec91571d440c4b3002ce15f97 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Thu, 22 Sep 2022 20:22:37 -0700
Subject: [PATCH 11/18] bug fixes

---
 ...r_v_random_select_transform_outgoing_e.cuh | 115 ++++++-----
 ...er_v_random_select_transform_outgoing_e.cu | 188 ++++++++++++++----
 2 files changed, 215 insertions(+), 88 deletions(-)

diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 32c863191ee..f82eb10d8c7 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -18,6 +18,7 @@
 #include <prims/property_op_utils.cuh>
 
 #include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/misc_utils.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <raft/random/rng.cuh>
@@ -85,6 +86,7 @@ struct invalid_col_comm_rank_t {
 };
 
 template <typename GraphViewType,
+          typename UniqueKeyIdxIterator,
           typename KeyIterator,
           typename OffsetIterator,
           typename LocalNbrIdxIterator,
@@ -102,6 +104,7 @@ struct transform_and_count_local_nbr_indices_t {
 
   edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>
     edge_partition{};
+  UniqueKeyIdxIterator unique_key_idx_first{};
   KeyIterator key_first{};
   OffsetIterator offset_first{};
   LocalNbrIdxIterator local_nbr_idx_first{};
@@ -113,9 +116,10 @@ struct transform_and_count_local_nbr_indices_t {
   edge_t invalid_idx{};
   thrust::optional<T> invalid_value{thrust::nullopt};
 
-  __device__ void operator()(size_t key_idx) const
+  __device__ void operator()(size_t i) const
   {
-    auto key = *(key_first + key_idx);
+    auto key_idx = *(unique_key_idx_first + i);
+    auto key     = *(key_first + key_idx);
     vertex_t major{};
     if constexpr (std::is_same_v<key_t, vertex_t>) {
       major = key;
@@ -123,6 +127,7 @@ struct transform_and_count_local_nbr_indices_t {
       major = thrust::get<0>(key);
     }
     auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    printf("major=%d major_offste=%d\n", (int)major, (int)major_offset);
     vertex_t const* indices{nullptr};
     thrust::optional<weight_t const*> weights{thrust::nullopt};
     [[maybe_unused]] edge_t local_degree{0};
@@ -141,8 +146,8 @@ struct transform_and_count_local_nbr_indices_t {
     } else {
       thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(major_offset);
     }
-    auto start_offset = *(offset_first + key_idx);
-    auto end_offset   = *(offset_first + (key_idx + 1));
+    auto start_offset = *(offset_first + i);
+    auto end_offset   = *(offset_first + (i + 1));
 
     size_t num_valid_local_nbr_indices{0};
     for (size_t i = start_offset; i < end_offset; ++i) {
@@ -269,6 +274,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                                          EdgeOp>::result_type,
                                T>);
 
+  CUGRAPH_EXPECTS(K >= size_t{1},
+                  "Invalid input argument: invalid K, K should be a positive integer.");
+
   if (do_expensive_check) {
     // FIXME: better re-factor this check function?
     vertex_t const* frontier_vertex_first{nullptr};
@@ -377,7 +385,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
 
   auto frontier_gathered_local_degrees =
     GraphViewType::is_multi_gpu
-      ? std::make_optional<rmm::device_uvector<edge_t>>(frontier.size() * K, handle.get_stream())
+      ? std::make_optional<rmm::device_uvector<edge_t>>(size_t{0}, handle.get_stream())
       : std::nullopt;
   if constexpr (GraphViewType::is_multi_gpu) {
     auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
@@ -401,15 +409,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
 
   rmm::device_uvector<edge_t> sample_nbr_indices(frontier.size() * K, handle.get_stream());
   // FIXME: get_sampling_index is inefficient when degree >> K & with_replacement = false
-  // FIXME: Need to verify that sample_nbr_indices[] are filled with INVALID_IDX when degree == 0
-  // (with_replacement = true) or degree < K (with_replacement = false)
-  cugraph::ops::gnn::graph::get_sampling_index(sample_nbr_indices.data(),
-                                               rng_state,
-                                               frontier_degrees.data(),
-                                               static_cast<edge_t>(frontier_degrees.size()),
-                                               static_cast<int32_t>(K),
-                                               with_replacement,
-                                               handle.get_stream());
+  if (frontier_degrees.size() > 0) {
+    cugraph::ops::gnn::graph::get_sampling_index(sample_nbr_indices.data(),
+                                                 rng_state,
+                                                 frontier_degrees.data(),
+                                                 static_cast<edge_t>(frontier_degrees.size()),
+                                                 static_cast<int32_t>(K),
+                                                 with_replacement,
+                                                 handle.get_stream());
+  }
   frontier_degrees.resize(0, handle.get_stream());
   frontier_degrees.shrink_to_fit(handle.get_stream());
 
@@ -418,20 +426,18 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   auto sample_local_nbr_indices = std::move(
     sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
                           // neighbors are distributed in col_comm_size partitions)
-  auto sample_key_indices =
-    GraphViewType::is_multi_gpu
-      ? std::make_optional<rmm::device_uvector<size_t>>(0, handle.get_stream())
-      : std::nullopt;
+  std::optional<rmm::device_uvector<size_t>> sample_key_indices{
+    std::nullopt};  // relevant only when multi-GPU
   auto local_frontier_sample_counts        = std::vector<size_t>{};
   auto local_frontier_sample_displacements = std::vector<size_t>{};
   if constexpr (GraphViewType::is_multi_gpu) {
     auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
     auto const col_comm_size = col_comm.get_size();
 
+    sample_key_indices =
+      rmm::device_uvector<size_t>(sample_local_nbr_indices.size(), handle.get_stream());
     auto col_comm_ranks =
       rmm::device_uvector<int32_t>(sample_local_nbr_indices.size(), handle.get_stream());
-    auto sample_key_indices =
-      rmm::device_uvector<size_t>(sample_local_nbr_indices.size(), handle.get_stream());
     auto input_pair_first = thrust::make_zip_iterator(
       thrust::make_tuple(sample_local_nbr_indices.begin(),
                          thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
@@ -441,7 +447,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
       input_pair_first,
       input_pair_first + sample_local_nbr_indices.size(),
       thrust::make_zip_iterator(thrust::make_tuple(
-        col_comm_ranks.begin(), sample_local_nbr_indices.begin(), sample_key_indices.begin())),
+        col_comm_ranks.begin(), sample_local_nbr_indices.begin(), (*sample_key_indices).begin())),
       convert_pair_to_triplet_t<edge_t>{
         raft::device_span<edge_t const>((*frontier_gathered_local_degrees).data(),
                                         (*frontier_gathered_local_degrees).size()),
@@ -453,7 +459,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     frontier_gathered_local_degrees = std::nullopt;
 
     auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
-      sample_local_nbr_indices.begin(), col_comm_ranks.begin(), sample_key_indices.begin()));
+      sample_local_nbr_indices.begin(), col_comm_ranks.begin(), (*sample_key_indices).begin()));
     sample_local_nbr_indices.resize(
       thrust::distance(triplet_first,
                        thrust::remove_if(handle.get_thrust_policy(),
@@ -462,13 +468,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                          invalid_col_comm_rank_t<edge_t>{int32_t{-1}})),
       handle.get_stream());
     col_comm_ranks.resize(sample_local_nbr_indices.size(), handle.get_stream());
-    sample_key_indices.resize(sample_local_nbr_indices.size(), handle.get_stream());
+    (*sample_key_indices).resize(sample_local_nbr_indices.size(), handle.get_stream());
 
     auto d_tx_counts =
       groupby_and_count(col_comm_ranks.begin(),
                         col_comm_ranks.end(),
                         thrust::make_zip_iterator(thrust::make_tuple(
-                          sample_local_nbr_indices.begin(), sample_key_indices.begin())),
+                          sample_local_nbr_indices.begin(), (*sample_key_indices).begin())),
                         thrust::identity<int32_t>{},
                         col_comm_size,
                         std::numeric_limits<size_t>::max(),
@@ -480,7 +486,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     handle.sync_stream();
 
     auto pair_first = thrust::make_zip_iterator(
-      thrust::make_tuple(sample_local_nbr_indices.begin(), sample_key_indices.begin()));
+      thrust::make_tuple(sample_local_nbr_indices.begin(), (*sample_key_indices).begin()));
     auto [rx_value_buffer, rx_counts] =
       shuffle_values(col_comm, pair_first, h_tx_counts, handle.get_stream());
 
@@ -561,30 +567,33 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                              unique_key_local_nbr_idx_counts.end(),
                              unique_key_local_nbr_idx_offsets.begin() + 1);
       auto offset_first = unique_key_local_nbr_idx_offsets.begin();
-      thrust::for_each(handle.get_thrust_policy(),
-                       unique_key_indices.begin(),
-                       unique_key_indices.end(),
-                       transform_and_count_local_nbr_indices_t<
-                         GraphViewType,
-                         decltype(edge_partition_frontier_key_first),
-                         decltype(offset_first),
-                         decltype(edge_partition_sample_local_nbr_index_first),
-                         decltype(edge_partition_sample_e_op_result_first),
-                         size_t*,
-                         edge_partition_src_input_device_view_t,
-                         edge_partition_dst_input_device_view_t,
-                         EdgeOp,
-                         T>{edge_partition,
-                            edge_partition_frontier_key_first,
-                            offset_first,
-                            edge_partition_sample_local_nbr_index_first,
-                            edge_partition_sample_e_op_result_first,
-                            thrust::nullopt,
-                            edge_partition_src_value_input,
-                            edge_partition_dst_value_input,
-                            e_op,
-                            cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
-                            invalid_value ? thrust::optional<T>{*invalid_value} : thrust::nullopt});
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(unique_key_indices.size()),
+        transform_and_count_local_nbr_indices_t<GraphViewType,
+                                                decltype(unique_key_indices.begin()),
+                                                decltype(edge_partition_frontier_key_first),
+                                                decltype(offset_first),
+                                                decltype(
+                                                  edge_partition_sample_local_nbr_index_first),
+                                                decltype(edge_partition_sample_e_op_result_first),
+                                                size_t*,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                EdgeOp,
+                                                T>{edge_partition,
+                                                   unique_key_indices.begin(),
+                                                   edge_partition_frontier_key_first,
+                                                   offset_first,
+                                                   edge_partition_sample_local_nbr_index_first,
+                                                   edge_partition_sample_e_op_result_first,
+                                                   thrust::nullopt,
+                                                   edge_partition_src_value_input,
+                                                   edge_partition_dst_value_input,
+                                                   e_op,
+                                                   cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
+                                                   to_thrust_optional(invalid_value)});
     } else {
       auto offset_first = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}),
                                                           multiplier_t<size_t>{K});
@@ -594,6 +603,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         thrust::make_counting_iterator(frontier.size()),
         transform_and_count_local_nbr_indices_t<
           GraphViewType,
+          decltype(thrust::make_counting_iterator(size_t{0})),
           decltype(edge_partition_frontier_key_first),
           decltype(offset_first),
           decltype(edge_partition_sample_local_nbr_index_first),
@@ -603,6 +613,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
           edge_partition_dst_input_device_view_t,
           EdgeOp,
           T>{edge_partition,
+             thrust::make_counting_iterator(size_t{0}),
              edge_partition_frontier_key_first,
              offset_first,
              edge_partition_sample_local_nbr_index_first,
@@ -612,7 +623,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
              edge_partition_dst_value_input,
              e_op,
              cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
-             invalid_value ? thrust::optional<T>{*invalid_value} : thrust::nullopt});
+             to_thrust_optional(invalid_value)});
     }
   }
 
@@ -672,8 +683,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
       auto output_first = get_dataframe_buffer_begin(tmp_sample_e_op_results);
       thrust::for_each(
         handle.get_thrust_policy(),
-        unique_key_indices.begin(),
-        unique_key_indices.end(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(frontier.size()),
         copy_and_fill_sample_e_op_results_t<decltype(input_first), decltype(output_first)>{
           raft::device_span<size_t const>((*sample_counts).data(), (*sample_counts).size()),
           raft::device_span<size_t const>(sample_displacements.data(), sample_displacements.size()),
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
index a1b6df9ad7d..7fde20e7cac 100644
--- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -29,6 +29,7 @@
 
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <cugraph/graph_view.hpp>
@@ -72,6 +73,7 @@ struct e_op_t {
 struct Prims_Usecase {
   size_t K{0};
   bool with_replacement{false};
+  bool use_invalid_value{false};
   bool test_weighted{false};
   bool check_correctness{true};
 };
@@ -154,6 +156,11 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
                                                         cugraph::to_thrust_tuple(property_t{})));
 
     std::optional<result_t> invalid_value{std::nullopt};
+    if (prims_usecase.use_invalid_value) {
+      invalid_value                  = result_t{};
+      thrust::get<0>(*invalid_value) = cugraph::invalid_vertex_id<vertex_t>::value;
+      thrust::get<1>(*invalid_value) = cugraph::invalid_vertex_id<vertex_t>::value;
+    }
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -185,41 +192,141 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
     // 3. validate MG results
 
     if (prims_usecase.check_correctness) {
-#if 0
-      cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(*handle_);
-      std::tie(sg_graph, std::ignore) =
+      auto d_mg_aggregate_renumber_map_labels = cugraph::test::device_allgatherv(
+        *handle_, (*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size());
+      auto out_degrees = mg_graph_view.compute_out_degrees(*handle_);
+
+      cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> unrenumbered_graph(*handle_);
+      std::tie(unrenumbered_graph, std::ignore) =
         cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
           *handle_, input_usecase, prims_usecase.test_weighted, false);
-      auto sg_graph_view = sg_graph.view();
-
-// 1. check whether sources coincide with the local vertices in the frontier or not
-
-// 2. check sample counts
-
-// 3. check destinations exist in the input graph
-
-// 4. check source/destination property values
-
-      auto sg_vertex_property_data = generate<property_t>::vertex_property(
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count,
-        *handle_);
-      auto sg_dst_prop =
-        generate<property_t>::dst_property(*handle_, sg_graph_view, sg_vertex_property_data);
-      auto sg_src_prop =
-        generate<property_t>::src_property(*handle_, sg_graph_view, sg_vertex_property_data);
-
-      auto expected_result = count_if_e(
-        *handle_,
-        sg_graph_view,
-        sg_src_prop.view(),
-        sg_dst_prop.view(),
-        [] __device__(auto src, auto dst, weight_t, auto src_property, auto dst_property) {
-          return src_property < dst_property;
-        });
-      ASSERT_TRUE(expected_result == result);
-#endif
+      auto unrenumbered_graph_view = unrenumbered_graph.view();
+
+      rmm::device_uvector<edge_t> unrenumbered_offsets(
+        unrenumbered_graph_view.number_of_vertices() + vertex_t{1}, handle_->get_stream());
+      thrust::copy(handle_->get_thrust_policy(),
+                   unrenumbered_graph_view.local_edge_partition_view().offsets().begin(),
+                   unrenumbered_graph_view.local_edge_partition_view().offsets().end(),
+                   unrenumbered_offsets.begin());
+      rmm::device_uvector<vertex_t> unrenumbered_indices(unrenumbered_graph_view.number_of_edges(),
+                                                         handle_->get_stream());
+      thrust::copy(handle_->get_thrust_policy(),
+                   unrenumbered_graph_view.local_edge_partition_view().indices().begin(),
+                   unrenumbered_graph_view.local_edge_partition_view().indices().end(),
+                   unrenumbered_indices.begin());
+
+      auto num_invalids = static_cast<size_t>(thrust::count_if(
+        handle_->get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(mg_vertex_frontier.bucket(bucket_idx_cur).size()),
+        [frontier_vertex_first         = mg_vertex_frontier.bucket(bucket_idx_cur).begin(),
+         v_first                       = mg_graph_view.local_vertex_partition_range_first(),
+         sample_offsets                = sample_offsets
+                                           ? thrust::make_optional<size_t const*>((*sample_offsets).data())
+                                           : thrust::nullopt,
+         sample_e_op_results           = cugraph::get_dataframe_buffer_begin(sample_e_op_results),
+         out_degrees                   = out_degrees.begin(),
+         aggregate_renumber_map_labels = d_mg_aggregate_renumber_map_labels.begin(),
+         unrenumbered_offsets          = unrenumbered_offsets.begin(),
+         unrenumbered_indices          = unrenumbered_indices.begin(),
+         K                             = prims_usecase.K,
+         with_replacement              = prims_usecase.with_replacement,
+         invalid_value =
+           invalid_value ? thrust::make_optional<result_t>(*invalid_value) : thrust::nullopt,
+         property_transform = cugraph::test::detail::property_transform<vertex_t, property_t>{
+           hash_bin_count}] __device__(size_t i) {
+          auto v = *(frontier_vertex_first + i);
+
+          // check sample_offsets
+
+          auto offset_first = sample_offsets ? *(*sample_offsets + i) : K * i;
+          auto offset_last  = sample_offsets ? *(*sample_offsets + (i + 1)) : K * (i + 1);
+          if (!sample_offsets) {
+            size_t num_valids{0};
+            for (size_t j = offset_first; j < offset_last; ++j) {
+              auto e_op_result = *(sample_e_op_results + j);
+              if (e_op_result == *invalid_value) { break; }
+              ++num_valids;
+            }
+            for (size_t j = offset_first + num_valids; j < offset_last; ++j) {
+              auto e_op_result = *(sample_e_op_results + j);
+              if (e_op_result != *invalid_value) { return true; }
+            }
+            offset_last = offset_first + num_valids;
+          }
+          auto count = offset_last - offset_first;
+
+          auto v_offset   = v - v_first;
+          auto out_degree = *(out_degrees + v_offset);
+          if (with_replacement) {
+            if ((out_degree > 0 && count != K) || (out_degree == 0 && count != 0)) { return true; }
+          } else {
+            if (count != std::min(static_cast<size_t>(out_degree), K)) { return true; }
+          }
+
+          // check sample_e_op_results
+
+          for (size_t j = offset_first; j < offset_last; ++j) {
+            auto e_op_result = *(sample_e_op_results + j);
+            auto src         = thrust::get<0>(e_op_result);
+            auto dst         = thrust::get<1>(e_op_result);
+            if (src != v) { return true; }
+            auto unrenumbered_src = *(aggregate_renumber_map_labels + src);
+            auto unrenumbered_dst = *(aggregate_renumber_map_labels + dst);
+            auto unrenumbered_dst_first =
+              unrenumbered_indices + *(unrenumbered_offsets + unrenumbered_src);
+            auto unrenumbered_dst_last =
+              unrenumbered_indices + *(unrenumbered_offsets + (unrenumbered_src + vertex_t{1}));
+            if (!thrust::binary_search(thrust::seq,
+                                       unrenumbered_dst_first,
+                                       unrenumbered_dst_last,
+                                       unrenumbered_dst)) {  // assumed neighbor lists are sorted
+              return true;
+            }
+            property_t src_val{};
+            property_t dst_val{};
+            if constexpr (cugraph::is_thrust_tuple_of_arithmetic<property_t>::value) {
+              src_val =
+                thrust::make_tuple(thrust::get<2>(e_op_result), thrust::get<3>(e_op_result));
+              dst_val =
+                thrust::make_tuple(thrust::get<4>(e_op_result), thrust::get<5>(e_op_result));
+            } else {
+              src_val = thrust::get<2>(e_op_result);
+              dst_val = thrust::get<3>(e_op_result);
+            }
+            if (src_val != property_transform(unrenumbered_src)) { return true; }
+            if (dst_val != property_transform(unrenumbered_dst)) { return true; }
+
+            if (!with_replacement) {
+              auto dst_first =
+                thrust::get<1>(sample_e_op_results.get_iterator_tuple()) + offset_first;
+              auto dst_last =
+                thrust::get<1>(sample_e_op_results.get_iterator_tuple()) + offset_last;
+              auto dst_count =
+                thrust::count(thrust::seq,
+                              dst_first,
+                              dst_last,
+                              dst);  // this could be inefficient for high-degree vertices, if we
+                                     // sort [dst_first, dst_last) we can use binary search but we
+                                     // may better not modify the sampling output and allow
+                                     // inefficiency as this is just for testing
+              auto multiplicity = thrust::distance(
+                thrust::lower_bound(
+                  thrust::seq, unrenumbered_dst_first, unrenumbered_dst_last, unrenumbered_dst),
+                thrust::upper_bound(thrust::seq,
+                                    unrenumbered_dst_first,
+                                    unrenumbered_dst_last,
+                                    unrenumbered_dst));  // this assumes neighbor lists are sorted
+              if (dst_count > multiplicity) { return true; }
+            }
+          }
+
+          return false;
+        }));
+
+      num_invalids = cugraph::host_scalar_allreduce(
+        handle_->get_comms(), num_invalids, raft::comms::op_t::SUM, handle_->get_stream());
+      ASSERT_TRUE(num_invalids == 0);
     }
   }
 
@@ -269,7 +376,10 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGPerVRandomSelectTransformOutgoingE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{size_t{4}, false, false, false, true},
+                      Prims_Usecase{size_t{4}, false, true, false, true},
+                      Prims_Usecase{size_t{4}, true, false, false, true},
+                      Prims_Usecase{size_t{4}, true, true, false, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -278,14 +388,20 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
   rmat_small_test,
   Tests_MGPerVRandomSelectTransformOutgoingE_Rmat,
-  ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+  ::testing::Combine(::testing::Values(Prims_Usecase{size_t{4}, false, false, false, true},
+                                       Prims_Usecase{size_t{4}, false, true, false, true},
+                                       Prims_Usecase{size_t{4}, true, false, false, true},
+                                       Prims_Usecase{size_t{4}, true, true, false, true}),
                      ::testing::Values(cugraph::test::Rmat_Usecase(
                        10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_large_test,
   Tests_MGPerVRandomSelectTransformOutgoingE_Rmat,
-  ::testing::Combine(::testing::Values(Prims_Usecase{false}),
+  ::testing::Combine(::testing::Values(Prims_Usecase{size_t{4}, false, false, false, false},
+                                       Prims_Usecase{size_t{4}, false, true, false, false},
+                                       Prims_Usecase{size_t{4}, true, false, false, false},
+                                       Prims_Usecase{size_t{4}, true, true, false, false}),
                      ::testing::Values(cugraph::test::Rmat_Usecase(
                        20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
 

From 2a7d0c845462da4f59b2d95055aa296635b21d91 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Thu, 22 Sep 2022 20:27:19 -0700
Subject: [PATCH 12/18] fix clang-format errors

---
 cpp/include/cugraph/utilities/device_functors.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index 8a75f2a3379..bf43b411757 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -81,7 +81,8 @@ struct strided_sum_t {
   size_t stride{0};
   size_t count{0};
 
-  __device__ T operator()(size_t start_offset) const {
+  __device__ T operator()(size_t start_offset) const
+  {
     T sum{0};
     for (size_t j = 0; j < count; ++j) {
       sum += values[start_offset + stride * j];

From 1fb69174ecb6e29df717c54e9f455efa0299326b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 23 Sep 2022 09:12:10 -0700
Subject: [PATCH 13/18] guard a cugraph-ops call inside ifdef

---
 cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index f82eb10d8c7..b874cbe433f 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -410,6 +410,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   rmm::device_uvector<edge_t> sample_nbr_indices(frontier.size() * K, handle.get_stream());
   // FIXME: get_sampling_index is inefficient when degree >> K & with_replacement = false
   if (frontier_degrees.size() > 0) {
+#ifndef NO_CUGRAPH_OPS
     cugraph::ops::gnn::graph::get_sampling_index(sample_nbr_indices.data(),
                                                  rng_state,
                                                  frontier_degrees.data(),
@@ -417,6 +418,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                                  static_cast<int32_t>(K),
                                                  with_replacement,
                                                  handle.get_stream());
+#else
+    CUGRAPH_FAIL("unimplemented.");
+#endif
   }
   frontier_degrees.resize(0, handle.get_stream());
   frontier_degrees.shrink_to_fit(handle.get_stream());

From 1d73867302591f51f94eed09b79e88c3ea389ac0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Fri, 23 Sep 2022 18:02:47 -0700
Subject: [PATCH 14/18] first draft implementation

---
 cpp/src/prims/detail/nbr_intersection.cuh     | 145 +++----
 ..._v_pair_transform_dst_nbr_intersection.cuh | 357 ++++++++++++++++++
 ...t_nbr_intersection_of_e_endpoints_by_v.cuh |  20 +-
 3 files changed, 439 insertions(+), 83 deletions(-)
 create mode 100644 cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh

diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index 91bb5bfec13..cb1ef32b660 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -473,6 +473,77 @@ struct gatherv_indices_t {
   }
 };
 
+template <typename GraphViewType, typename VertexPairIterator>
+size_t count_invalid_vertex_pairs(raft::handle_t const& handle,
+                                  GraphViewType const& graph_view,
+                                  VertexPairIterator vertex_pair_first,
+                                  VertexPairIterator vertex_pair_last)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+
+  std::vector<vertex_t> h_edge_partition_major_range_firsts(
+    graph_view.number_of_local_edge_partitions());
+  std::vector<vertex_t> h_edge_partition_major_range_lasts(
+    h_edge_partition_major_range_firsts.size());
+  vertex_t edge_partition_minor_range_first{};
+  vertex_t edge_partition_minor_range_last{};
+  if constexpr (GraphViewType::is_multi_gpu) {
+    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) {
+      if constexpr (GraphViewType::is_storage_transposed) {
+        h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i);
+        h_edge_partition_major_range_lasts[i]  = graph_view.local_edge_partition_dst_range_last(i);
+      } else {
+        h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i);
+        h_edge_partition_major_range_lasts[i]  = graph_view.local_edge_partition_src_range_last(i);
+      }
+    }
+    if constexpr (GraphViewType::is_storage_transposed) {
+      edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first();
+      edge_partition_minor_range_last  = graph_view.local_edge_partition_src_range_last();
+    } else {
+      edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first();
+      edge_partition_minor_range_last  = graph_view.local_edge_partition_dst_range_last();
+    }
+  } else {
+    h_edge_partition_major_range_firsts[0] = vertex_t{0};
+    h_edge_partition_major_range_lasts[0]  = graph_view.number_of_vertices();
+    edge_partition_minor_range_first       = vertex_t{0};
+    edge_partition_minor_range_last        = graph_view.number_of_vertices();
+  }
+  rmm::device_uvector<vertex_t> d_edge_partition_major_range_firsts(
+    h_edge_partition_major_range_firsts.size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> d_edge_partition_major_range_lasts(
+    h_edge_partition_major_range_lasts.size(), handle.get_stream());
+  raft::update_device(d_edge_partition_major_range_firsts.data(),
+                      h_edge_partition_major_range_firsts.data(),
+                      h_edge_partition_major_range_firsts.size(),
+                      handle.get_stream());
+  raft::update_device(d_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.size(),
+                      handle.get_stream());
+
+  auto num_invalid_pairs = thrust::count_if(
+    handle.get_thrust_policy(),
+    vertex_pair_first,
+    vertex_pair_last,
+    is_invalid_input_vertex_pair_t<vertex_t>{
+      graph_view.number_of_vertices(),
+      raft::device_span<vertex_t const>(d_edge_partition_major_range_firsts.begin(),
+                                        d_edge_partition_major_range_firsts.end()),
+      raft::device_span<vertex_t const>(d_edge_partition_major_range_lasts.begin(),
+                                        d_edge_partition_major_range_lasts.end()),
+      edge_partition_minor_range_first,
+      edge_partition_minor_range_last});
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& comm = handle.get_comms();
+    num_invalid_pairs =
+      host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream());
+  }
+
+  return num_invalid_pairs;
+}
+
 // In multi-GPU, the first element of every vertex pair in [vertex_pair_first, vertex_pair) should
 // be within the valid edge partition major range assigned to this process and the second element
 // should be within the valid edge partition minor range assigned to this process.
@@ -483,7 +554,7 @@ struct gatherv_indices_t {
 // one can limit the number of unique vertices (aggregated over column communicator in multi-GPU) to
 // build neighbor list; we need to bulid neighbor lists for the first element of every input vertex
 // pair if intersect_dst_nbr[0] == GraphViewType::is_storage_transposed and build neighbor lists for
-// the second element of every input vertex pair if single-GPU and intersect_dst_nbr[0] ==
+// the second element of every input vertex pair if single-GPU and intersect_dst_nbr[1] ==
 // GraphViewType::is_storage_transposed or multi-GPU. For load balancing,
 // thrust::distance(vertex_pair_first, vertex_pair_last) should be comparable across the global
 // communicator. If we need to build the neighbor lists, grouping based on applying "vertex ID %
@@ -517,79 +588,22 @@ nbr_intersection(raft::handle_t const& handle,
   if (do_expensive_check) {
     auto is_sorted =
       thrust::is_sorted(handle.get_thrust_policy(), vertex_pair_first, vertex_pair_last);
-
-    std::vector<vertex_t> h_edge_partition_major_range_firsts(
-      graph_view.number_of_local_edge_partitions());
-    std::vector<vertex_t> h_edge_partition_major_range_lasts(
-      h_edge_partition_major_range_firsts.size());
-    vertex_t edge_partition_minor_range_first{};
-    vertex_t edge_partition_minor_range_last{};
-    if constexpr (GraphViewType::is_multi_gpu) {
-      for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) {
-        if constexpr (GraphViewType::is_storage_transposed) {
-          h_edge_partition_major_range_firsts[i] =
-            graph_view.local_edge_partition_dst_range_first(i);
-          h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i);
-        } else {
-          h_edge_partition_major_range_firsts[i] =
-            graph_view.local_edge_partition_src_range_first(i);
-          h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i);
-        }
-      }
-      if constexpr (GraphViewType::is_storage_transposed) {
-        edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first();
-        edge_partition_minor_range_last  = graph_view.local_edge_partition_src_range_last();
-      } else {
-        edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first();
-        edge_partition_minor_range_last  = graph_view.local_edge_partition_dst_range_last();
-      }
-    } else {
-      h_edge_partition_major_range_firsts[0] = vertex_t{0};
-      h_edge_partition_major_range_lasts[0]  = graph_view.number_of_vertices();
-      edge_partition_minor_range_first       = vertex_t{0};
-      edge_partition_minor_range_last        = graph_view.number_of_vertices();
-    }
-    rmm::device_uvector<vertex_t> d_edge_partition_major_range_firsts(
-      h_edge_partition_major_range_firsts.size(), handle.get_stream());
-    rmm::device_uvector<vertex_t> d_edge_partition_major_range_lasts(
-      h_edge_partition_major_range_lasts.size(), handle.get_stream());
-    raft::update_device(d_edge_partition_major_range_firsts.data(),
-                        h_edge_partition_major_range_firsts.data(),
-                        h_edge_partition_major_range_firsts.size(),
-                        handle.get_stream());
-    raft::update_device(d_edge_partition_major_range_lasts.data(),
-                        h_edge_partition_major_range_lasts.data(),
-                        h_edge_partition_major_range_lasts.size(),
-                        handle.get_stream());
-
-    auto num_invalid_pairs = thrust::count_if(
-      handle.get_thrust_policy(),
-      vertex_pair_first,
-      vertex_pair_last,
-      is_invalid_input_vertex_pair_t<vertex_t>{
-        graph_view.number_of_vertices(),
-        raft::device_span<vertex_t const>(d_edge_partition_major_range_firsts.begin(),
-                                          d_edge_partition_major_range_firsts.end()),
-        raft::device_span<vertex_t const>(d_edge_partition_major_range_lasts.begin(),
-                                          d_edge_partition_major_range_lasts.end()),
-        edge_partition_minor_range_first,
-        edge_partition_minor_range_last});
     if constexpr (GraphViewType::is_multi_gpu) {
       auto& comm = handle.get_comms();
-
-      is_sorted = static_cast<bool>(host_scalar_allreduce(
+      is_sorted  = static_cast<bool>(host_scalar_allreduce(
         comm, static_cast<int>(is_sorted), raft::comms::op_t::MIN, handle.get_stream()));
-      num_invalid_pairs =
-        host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream());
     }
-
     CUGRAPH_EXPECTS(is_sorted, "Invalid input arguments: input vertex pairs should be sorted.");
+
+    auto num_invalid_pairs =
+      count_invalid_vertex_pairs(handle, graph_view, vertex_pair_first, vertex_pair_last);
     CUGRAPH_EXPECTS(num_invalid_pairs == 0,
                     "Invalid input arguments: there are invalid input vertex pairs.");
   }
 
   // 2. Collect neighbor lists for unique second pair elements (for the neighbors within the minor
-  // range for this GPU)
+  // range for this GPU); Note that no need to collect for first pair elements as they already
+  // locally reside.
 
   auto poly_alloc = rmm::mr::polymorphic_allocator<char>(rmm::mr::get_current_device_resource());
   [[maybe_unused]] auto stream_adapter =
@@ -634,6 +648,9 @@ nbr_intersection(raft::handle_t const& handle,
         unique_majors.shrink_to_fit(handle.get_stream());
 
         if (col_comm_size > 1) {
+          // FIXME: We may refactor this code to improve scalability. We may call multiple gatherv
+          // calls, perform local sort and unique, and call multiple broadcasts rather than
+          // performing sort and unique for the entire range in every GPU in col_comm.
           auto rx_counts =
             host_scalar_allgather(col_comm, unique_majors.size(), handle.get_stream());
           std::vector<size_t> rx_displacements(rx_counts.size());
diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
new file mode 100644
index 00000000000..a0f2d6f5338
--- /dev/null
+++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <detail/graph_utils.cuh>
+#include <prims/detail/nbr_intersection.cuh>
+#include <prims/property_op_utils.cuh>
+
+#include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/device_functors.cuh>
+#include <cugraph/utilities/error.hpp>
+
+#include <raft/handle.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/for_each.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/merge.h>
+#include <thrust/reduce.h>
+#include <thrust/sort.h>
+#include <thrust/tabulate.h>
+#include <thrust/tuple.h>
+
+#include <type_traits>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename VertexPairIterator>
+struct compute_local_edge_partition_id_t {
+  VertexPairIterator vertex_pair_first{};
+  size_t num_local_edge_partitions{};
+  raft::device_span<vertex_t const> edge_partition_major_range_lasts{};
+
+  __device__ int operator()(size_t i) const
+  {
+    auto major = thurst::get<0>(*(vertex_pair_first + i));
+    return static_cast<int>(
+      thrust::distance(edge_partition_major_range_lasts.begin(),
+                       thrust::upper_bound(thrust::seq,
+                                           edge_partition_major_range_lasts.begin(),
+                                           edge_partition_major_range_lasts.end(),
+                                           major)));
+  }
+};
+
+template <typename VertexPairIterator>
+struct compute_chunk_id_t {
+  VertexPairIterator vertex_pair_first{};
+  size_t num_chunks{};
+
+  __device__ int operator()(size_t i) const
+  {
+    return static_cast<int>(thrust::get<1>(*(vertex_pair_first + i)) % num_chunks);
+  }
+};
+
+template <typename GraphViewType,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename IntersectionOp,
+          typename VertexPairIndexIterator,
+          typename VertexPairIterator,
+          typename VertexPairValueOutputIterator>
+struct call_intersection_op_t {
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               typename GraphViewType::weight_type,
+                               GraphViewType::is_multi_gpu>
+    edge_partition{};
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{};
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{};
+  IntersectionOp intersection_op{};
+  size_t const* nbr_offsets{nullptr};
+  typename GraphViewType::vertex_type const* nbr_indices{nullptr};
+  VertexPairIndexIterator major_minor_pair_index_first{};
+  VertexPairIterator major_minor_pair_first{};
+  VertexPairValueOutputIterator major_minor_pair_value_output_first{};
+
+  __device__ void operator()(size_t i) const
+  {
+    auto index        = *(major_minor_pair_index_first + i);
+    auto pair         = *(major_minor_pair_first + index);
+    auto major        = thrust::get<0>(pair);
+    auto minor        = thrust::get<1>(pair);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+    auto src          = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst          = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    auto intersection = raft::device_span<typename GraphViewType::vertex_type const>(
+      nbr_indices + nbr_offsets[i], nbr_indices + nbr_offsets[i + 1]);
+    *(major_minor_pair_value_output_first + index) =
+      evaluate_intersection_op<GraphViewType,
+                               typename EdgePartitionSrcValueInputWrapper::value_type,
+                               typename EdgePartitionDstValueInputWrapper::value_type,
+                               IntersectionOp>()
+        .compute(src,
+                 dst,
+                 edge_partition_src_value_input.get(src_offset),
+                 edge_partition_dst_value_input.get(dst_offset),
+                 intersection,
+                 intersection_op);
+  }
+};
+
+}  // namespace detail
+
+/**
+ * @brief Iterate over each input vertex pair and apply a functor to the common destination neighbor
+ * list of the pair.
+ *
+ * Iterate over every vertex pair; intersect destination neighbor lists of the two vertices in the
+ * pair; invoke a user-provided functor, and store the functor output.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam VertexPairIterator Type of the iterator for input vertex pairs.
+ * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
+ * @tparam IntersectionOp Type of the quinary per intersection operator.
+ * @tparam VertexPairValueOutputIterator Type of the iterator for vertex pair output property
+ * variables.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param vertex_pair_first Iterator pointing to the first (inclusive) input vertex pair.
+ * @param vertex_pair_last Iterator pointing to the last (exclusive) input vertex pair.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param intersection_op quinary operator takes first vertex of the pair, second vertex of the
+ * pair, property values for the first vertex, property values for the second vertex, and a list of
+ * vertices in the intersection of the first & second vertices' destination neighbors and returns an
+ * output value for the input pair.
+ * @param vertex_pair_value_output_first Iterator pointing to the vertex pair property variables for
+ * the first vertex pair (inclusive). `vertex_pair_value_output_last` (exclusive) is deduced as @p
+ * vertex_pair_value_output_first + @p thrust::distance(vertex_pair_first, vertex_pair_last).
+ * @param A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename GraphViewType,
+          typename VertexPairIterator,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename IntersectionOp,
+          typename VertexPairValueOutputIterator>
+void per_v_pair_transform_dst_nbr_intersection(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  VertexPairIterator vertex_pair_first,
+  VertexPairIterator vertex_pair_last,
+  EdgeSrcValueInputWrapper edge_src_value_input,
+  EdgeDstValueInputWrapper edge_dst_value_input,
+  IntersectionOp intersection_op,
+  VertexPairValueOutputIterator vertex_pair_value_output_first,
+  bool do_expensive_check = false)
+{
+  static_assert(!GraphViewType::is_storage_transposed);
+
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+  using result_t = typename thrust::iterator_traits<VertexPairValueOutputIterator>::value_type;
+
+  using edge_partition_src_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       detail::edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeSrcValueInputWrapper::value_iterator>,
+                       detail::edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeSrcValueInputWrapper::value_iterator>>>;
+  using edge_partition_dst_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       detail::edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeDstValueInputWrapper::value_iterator>,
+                       detail::edge_partition_endpoint_property_device_view_t<
+                         vertex_t,
+                         typename EdgeDstValueInputWrapper::value_iterator>>>;
+
+  if (do_expensive_check) {
+    auto num_invalids =
+      detail::count_invalid_vertex_pairs(handle, graph_view, vertex_pair_first, vertex_pair_last);
+    CUGRAPH_EXPECTS(num_invalids == 0, "");
+  }
+
+  rmm::device_uvector<size_t> vertex_pair_indices(
+    thrust::distance(vertex_pair_first, vertex_pair_last), handle.get_stream());
+  thrust::sequence(
+    handle.get_thrust_policy(), vertex_pair_indices.begin(), vertex_pair_indices.end(), size_t{0});
+
+  std::vector<vertex_t> h_edge_partition_major_range_lasts(
+    graph_view.number_of_local_edge_partitions);
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_first(i);
+  }
+  rmm::device_uvector<vertex_t> d_edge_partition_major_range_lasts(
+    h_edge_partition_major_range_lasts.size(), handle.get_stream());
+  raft::update_device(d_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.size(),
+                      std::cout);
+  auto d_edge_partition_group_sizes = groupby_and_count(
+    vertex_pair_indices.begin(),
+    vertex_pair_indices.end(),
+    detail::compute_local_edge_partition_id_t<VertexPairIterator>{
+      vertex_pair_first,
+      graph_view.number_of_local_edge_partitions(),
+      raft::device_span<vertex_t const>(d_edge_partition_major_range_lasts.data(),
+                                        d_edge_partition_major_range_lasts.size())},
+    static_cast<int>(graph_view.number_of_local_edge_partitions()),
+    std::numeric_limits<size_t>::max(),
+    handle.get_stream());
+  std::vector<size_t> h_edge_partition_group_sizes(d_edge_partition_group_sizes.size());
+  raft::update_host(h_edge_partition_group_sizes.data(),
+                    d_edge_partition_group_sizes.data(),
+                    d_edge_partition_group_sizes.size(),
+                    handle.get_stream());
+  handle.sync_stream();
+  std::vector<size_t> h_edge_partition_group_displacements(h_edge_partition_group_sizes.size());
+  std::exclusive_scan(h_edge_partition_group_sizes.begin(),
+                      h_edge_partition_group_sizes.end(),
+                      h_edge_partition_group_displacements.begin());
+
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+
+    edge_partition_src_input_device_view_t edge_partition_src_value_input{};
+    edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input);
+      edge_partition_dst_value_input =
+        edge_partition_dst_input_device_view_t(edge_dst_value_input, i);
+    } else {
+      edge_partition_src_value_input =
+        edge_partition_src_input_device_view_t(edge_src_value_input, i);
+      edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input);
+    }
+
+    auto edge_partition_vertex_pair_index_first =
+      vertex_pair_indices.begin() + h_edge_partition_group_displacements[i];
+
+    // FIXME: Peak memory requirement is also dependent on the average minimum degree of the input
+    // vertex pairs. We may need a more sophisticated mechanism to set max_chunk_size considering
+    // vertex degrees. to limit memory footprint ((1 << 15) is a tuning parameter)
+    auto max_chunk_size =
+      static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 15);
+    auto max_num_chunks = (majors.size() + max_chunk_size - 1) / max_chunk_size;
+    if constexpr (GraphViewType::is_multi_gpu) {
+      max_num_chunks = host_scalar_allreduce(
+        handle.get_comms(), max_num_chunks, raft::comms::op_t::MAX, handle.get_stream());
+    }
+
+    std::vector<size_t> h_chunk_sizes(max_num_chunks);
+    if (h_chunk_sizes.size() > size_t{1}) {
+      auto d_chunk_sizes =
+        groupby_and_count(edge_partition_vertex_pair_index_first,
+                          edge_partition_vertex_pair_index_first + h_edge_partition_group_sizes[i],
+                          detail::compute_chunk_id_t<VertexPairIterator>{max_num_chunks},
+                          static_cast<int>(max_num_chunks),
+                          std::numeric_limits<size_t>::max(),
+                          handle.get_stream());
+      raft::update_host(
+        h_chunk_sizes.data(), d_chunk_sizes.data(), d_chunk_sizes.size(), handle.get_stream());
+      handle.sync_stream();
+    } else if (h_chunk_sizes.size() == size_t{1}) {
+      h_chunk_sizes[0] = majors.size();
+    }
+
+    auto chunk_vertex_pair_index_first = edge_partition_vertx_pair_index_first;
+    for (size_t j = 0; j < h_chunk_sizes.size(); ++j) {
+      auto this_chunk_size = h_chunk_sizes[j];
+
+      thrust::sort(handle.get_thrust_policy(),
+                   chunk_vertex_pair_index_first,
+                   chunk_vertex_pair_index_first + this_chunk_size,
+                   indirection_copmare_less_t<VertexPairIterator>{
+                     vertex_pair_first});  // detail::nbr_intersection() requires the input vertex
+                                           // pairs to be sorted.
+
+      // FIXME: better restrict detail::nbr_intersection input vetex pairs to a single edge
+      // partition? This may provide additional performance improvement opportunities???
+      auto chunk_vertex_pair_fist = thrust::make_transform_iterator(
+        chunk_vertex_pair_index_first, indirection_t<VertexPairIterator>{vertex_pair_first});
+      auto [intersection_offsets, intersection_indices] =
+        detail::nbr_intersection(handle,
+                                 graph_view,
+                                 chunk_vertex_pair_first,
+                                 chunk_vertex_pair_first + this_chunk_size,
+                                 std::array<bool, 2>{true, true},
+                                 do_expensive_check);
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(this_chunk_size),
+        detail::call_intersection_op_t<GraphViewType,
+                                       edge_partition_src_input_device_view_t,
+                                       edge_partition_dst_input_device_view_t,
+                                       IntersectionOp,
+                                       decltype(chunk_vertex_pair_index_first),
+                                       VertexPairIterator VertexPairValueOutputIterator>{
+          edge_partition,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          intersection_op,
+          intersection_offsets.data(),
+          intersection_indices.data(),
+          chunk_vertex_pair_index_first,
+          vertex_pair_first,
+          vertex_pair_value_output_first});
+
+      chunk_vertex_pair_index_first += this_chunk_size;
+    }
+  }
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
index ab33dc7d7b9..5267bf58488 100644
--- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
+++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
@@ -275,24 +275,6 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
                vertex_value_output_first + graph_view.local_vertex_partition_range_size(),
                init);
 
-  std::optional<rmm::device_uvector<vertex_t>>
-    d_vertex_partition_range_lasts_in_edge_partition_minor_range{std::nullopt};
-  if constexpr (GraphViewType::is_multi_gpu) {
-    auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-    auto const row_comm_size = row_comm.get_size();
-
-    auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-    auto const col_comm_rank = col_comm.get_rank();
-
-    d_vertex_partition_range_lasts_in_edge_partition_minor_range =
-      rmm::device_uvector<vertex_t>(row_comm_size, handle.get_stream());
-    auto h_vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
-    raft::update_device((*d_vertex_partition_range_lasts_in_edge_partition_minor_range).data(),
-                        h_vertex_partition_range_lasts.data() + row_comm_size * col_comm_rank,
-                        row_comm_size,
-                        handle.get_stream());
-  }
-
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
@@ -325,7 +307,7 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
 
     // FIXME: Peak memory requirement is also dependent on the average minimum degree of the input
     // vertex pairs. We may need a more sophisticated mechanism to set max_chunk_size considering
-    // vertex degrees. to limit memory footprint ((1 << 10) is a tuning parameter)
+    // vertex degrees. to limit memory footprint ((1 << 15) is a tuning parameter)
     auto max_chunk_size =
       static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 15);
     auto max_num_chunks = (majors.size() + max_chunk_size - 1) / max_chunk_size;

From bbe6fbacad0bd8b08b4569a670b662f5d0782288 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Mon, 26 Sep 2022 15:42:32 -0700
Subject: [PATCH 15/18] rename utility functors

---
 cpp/include/cugraph/graph_functions.hpp       | 23 ++++++-----
 .../weakly_connected_components_impl.cuh      |  2 +-
 cpp/src/detail/graph_utils.cuh                | 41 +++++++++++++++----
 cpp/src/detail/shuffle_wrappers.cu            |  8 ++--
 .../create_graph_from_edgelist_impl.cuh       |  2 +-
 cpp/src/structure/renumber_edgelist_impl.cuh  |  5 ++-
 cpp/src/utilities/cython.cu                   |  6 +--
 .../utilities/matrix_market_file_utilities.cu |  2 +-
 8 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index fcf5cb00572..ac14642f2c3 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -51,7 +51,8 @@ struct renumber_meta_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<!multi_gpu>
  *
  * This function assumes that vertices are pre-shuffled to their target processes and edges are
  * pre-shuffled to their target processess and edge partitions using compute_gpu_id_from_vertex_t
- * and compute_gpu_id_from_edge_t & compute_partition_id_from_edge_t functors, respectively.
+ * and compute_gpu_id_from_ext_edge_endpoints_t & compute_partition_id_from_ext_edge_endpoints_t
+ * functors, respectively.
  *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
@@ -65,12 +66,12 @@ struct renumber_meta_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<!multi_gpu>
  * work (vertices should be pre-shuffled).
  * @param edgelist_srcs Pointers (one pointer per local edge partition assigned to this process) to
  * edge source vertex IDs. Source IDs are updated in-place ([INOUT] parameter). Applying the
- * compute_gpu_id_from_edge_t functor to every (destination ID, source ID) pair (if store_transposed
- * = true) or (source ID, destination ID) pair (if store_transposed = false) should return the local
- * GPU ID for this function to work (edges should be pre-shuffled). Applying the
- * compute_partition_id_from_edge_t to every (destination ID, source ID) pair (if store_transposed =
- * true) or (source ID, destination ID) pair (if store_transposed = false) should also return the
- * corresponding edge partition ID. The best way to enforce this is to use
+ * compute_gpu_id_from_ext_edge_endpoints_t functor to every (destination ID, source ID) pair (if
+ * store_transposed = true) or (source ID, destination ID) pair (if store_transposed = false) should
+ * return the local GPU ID for this function to work (edges should be pre-shuffled). Applying the
+ * compute_partition_id_from_ext_edge_endpoints_t to every (destination ID, source ID) pair (if
+ * store_transposed = true) or (source ID, destination ID) pair (if store_transposed = false) should
+ * also return the corresponding edge partition ID. The best way to enforce this is to use
  * shuffle_edgelist_by_gpu_id & groupby_and_count_edgelist_by_local_partition_id.
  * @param edgelist_dsts Pointers (one pointer per local edge partition assigned to this process) to
  * edge destination vertex IDs. Destination IDs are updated in-place ([INOUT] parameter).
@@ -347,8 +348,8 @@ void renumber_local_ext_vertices(raft::handle_t const& handle,
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param edgelist_srcs Vector of edge source vertex IDs. If multi-GPU, applying the
- * compute_gpu_id_from_edge_t to every edge should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
+ * compute_gpu_id_from_ext_edge_endpoints_t to every edge should return the local GPU ID for this
+ * function to work (edges should be pre-shuffled).
  * @param edgelist_dsts Vector of edge destination vertex IDs.
  * @param edgelist_weights Vector of edge weights.
  * @param reciprocal Flag indicating whether to keep (if set to `false`) or discard (if set to
@@ -500,8 +501,8 @@ extract_induced_subgraphs(
  * compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function to
  * work (vertices should be pre-shuffled).
  * @param edgelist_srcs Vector of edge source vertex IDs. If multi-GPU, applying the
- * compute_gpu_id_from_edge_t to every edge should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
+ * compute_gpu_id_from_ext_edge_endpoints_t to every edge should return the local GPU ID for this
+ * function to work (edges should be pre-shuffled).
  * @param edgelist_dsts Vector of edge destination vertex IDs.
  * @param edgelist_weights Vector of edge weights.
  * @param edgelist_id_type_pairs Vector of edge ID and type pairs.
diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh
index 6aa14ce5e73..d2a1ef30d67 100644
--- a/cpp/src/components/weakly_connected_components_impl.cuh
+++ b/cpp/src/components/weakly_connected_components_impl.cuh
@@ -711,7 +711,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
           get_dataframe_buffer_begin(edge_buffer),
           get_dataframe_buffer_end(edge_buffer),
           [key_func =
-             cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+             cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
             return key_func(thrust::get<0>(val), thrust::get<1>(val));
           },
diff --git a/cpp/src/detail/graph_utils.cuh b/cpp/src/detail/graph_utils.cuh
index 8094de556c1..f7b0490f156 100644
--- a/cpp/src/detail/graph_utils.cuh
+++ b/cpp/src/detail/graph_utils.cuh
@@ -54,21 +54,19 @@ struct compute_gpu_id_from_ext_vertex_t {
 
 template <typename vertex_t>
 struct compute_gpu_id_from_int_vertex_t {
-  raft::device_span<vertex_t> vertex_partition_range_lasts_span;
+  raft::device_span<vertex_t const> vertex_partition_range_lasts{};
 
   __device__ int operator()(vertex_t v) const
   {
-    return static_cast<int>(
-      thrust::distance(vertex_partition_range_lasts_span.begin(),
-                       thrust::upper_bound(thrust::seq,
-                                           vertex_partition_range_lasts_span.begin(),
-                                           vertex_partition_range_lasts_span.end(),
-                                           v)));
+    return static_cast<int>(thrust::distance(
+      vertex_partition_range_lasts.begin(),
+      thrust::upper_bound(
+        thrust::seq, vertex_partition_range_lasts.begin(), vertex_partition_range_lasts.end(), v)));
   }
 };
 
 template <typename vertex_t>
-struct compute_gpu_id_from_edge_t {
+struct compute_gpu_id_from_ext_edge_endpoints_t {
   int comm_size{0};
   int row_comm_size{0};
   int col_comm_size{0};
@@ -83,7 +81,32 @@ struct compute_gpu_id_from_edge_t {
 };
 
 template <typename vertex_t>
-struct compute_partition_id_from_edge_t {
+struct compute_gpu_id_from_int_edge_endpoints_t {
+  raft::device_span<vertex_t const> vertex_partition_range_lasts{};
+  int comm_size{0};
+  int row_comm_size{0};
+  int col_comm_size{0};
+
+  __device__ int operator()(vertex_t major, vertex_t minor) const
+  {
+    auto major_comm_rank =
+      static_cast<int>(thrust::distance(vertex_partition_range_lasts.begin(),
+                                        thrust::upper_bound(thrust::seq,
+                                                            vertex_partition_range_lasts.begin(),
+                                                            vertex_partition_range_lasts.end(),
+                                                            major)));
+    auto minor_comm_rank =
+      static_cast<int>(thrust::distance(vertex_partition_range_lasts.begin(),
+                                        thrust::upper_bound(thrust::seq,
+                                                            vertex_partition_range_lasts.begin(),
+                                                            vertex_partition_range_lasts.end(),
+                                                            major)));
+    return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
+  }
+};
+
+template <typename vertex_t>
+struct compute_partition_id_from_ext_edge_endpoints_t {
   int comm_size{0};
   int row_comm_size{0};
   int col_comm_size{0};
diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu
index 93e408feccd..f8271cbfa4a 100644
--- a/cpp/src/detail/shuffle_wrappers.cu
+++ b/cpp/src/detail/shuffle_wrappers.cu
@@ -84,7 +84,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
       edge_first,
       edge_first + d_edgelist_majors.size(),
       [key_func =
-         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+         cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
            comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
@@ -133,7 +133,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
       edge_first,
       edge_first + d_edgelist_majors.size(),
       [key_func =
-         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+         cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
            comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
@@ -364,7 +364,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
       [comm_size,
        row_comm_size,
        partition_id_key_func =
-         cugraph::detail::compute_partition_id_from_edge_t<vertex_t>{
+         cugraph::detail::compute_partition_id_from_ext_edge_endpoints_t<vertex_t>{
            comm_size, row_comm_size, col_comm_size},
        gpu_id_key_func = cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
          comm_size}] __device__(auto pair) {
@@ -412,7 +412,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
   } else {
     auto local_partition_id_op =
       [comm_size,
-       key_func = cugraph::detail::compute_partition_id_from_edge_t<vertex_t>{
+       key_func = cugraph::detail::compute_partition_id_from_ext_edge_endpoints_t<vertex_t>{
          comm_size, row_comm_size, col_comm_size}] __device__(auto pair) {
         return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) /
                comm_size;  // global partition id to local partition id
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index dfe2941eb86..6ddef877d52 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -115,7 +115,7 @@ void expensive_check_edgelist(raft::handle_t const& handle,
                        edge_first + edgelist_majors.size(),
                        [comm_rank,
                         gpu_id_key_func =
-                          detail::compute_gpu_id_from_edge_t<vertex_t>{
+                          detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                             comm_size, row_comm_size, col_comm_size}] __device__(auto e) {
                          return (gpu_id_key_func(thrust::get<0>(e), thrust::get<1>(e)) !=
                                  comm_rank);
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index f473bfba292..e45cdef8192 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -510,9 +510,10 @@ void expensive_check_edgelist(
            col_comm_rank,
            i,
            gpu_id_key_func =
-             detail::compute_gpu_id_from_edge_t<vertex_t>{comm_size, row_comm_size, col_comm_size},
+             detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+               comm_size, row_comm_size, col_comm_size},
            partition_id_key_func =
-             detail::compute_partition_id_from_edge_t<vertex_t>{
+             detail::compute_partition_id_from_ext_edge_endpoints_t<vertex_t>{
                comm_size, row_comm_size, col_comm_size}] __device__(auto edge) {
             return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) ||
                    (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) !=
diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index 727a63c7080..10c6f2f616c 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -840,7 +840,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
         zip_edge,
         zip_edge + num_edgelist_edges,
         [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
              comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
           return key_func(thrust::get<0>(val), thrust::get<1>(val));
         },
@@ -856,7 +856,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
         zip_edge,
         zip_edge + num_edgelist_edges,
         [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
              comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
           return key_func(thrust::get<0>(val), thrust::get<1>(val));
         },
@@ -865,7 +865,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
 
   auto local_partition_id_op =
     [comm_size,
-     key_func = cugraph::detail::compute_partition_id_from_edge_t<vertex_t>{
+     key_func = cugraph::detail::compute_partition_id_from_ext_edge_endpoints_t<vertex_t>{
        comm_size, row_comm_size, col_comm_size}] __device__(auto pair) {
       return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) /
              comm_size;  // global partition id to local partition id
diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu
index 5804b0e4b40..616f69eebe7 100644
--- a/cpp/tests/utilities/matrix_market_file_utilities.cu
+++ b/cpp/tests/utilities/matrix_market_file_utilities.cu
@@ -342,7 +342,7 @@ read_edgelist_from_matrix_market_file(raft::handle_t const& handle,
       handle.get_stream());
     d_vertices.shrink_to_fit(handle.get_stream());
 
-    auto edge_key_func = cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+    auto edge_key_func = cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, row_comm_size, col_comm_size};
     size_t number_of_local_edges{};
     if (d_edgelist_weights) {

From 342036455c69c4ebd6f1702547c34b5d5ce46431 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 27 Sep 2022 14:39:01 -0700
Subject: [PATCH 16/18] add a test and fix compile errors

---
 .../cugraph/utilities/device_functors.cuh     |  10 +
 cpp/src/detail/graph_utils.cuh                |  19 +-
 cpp/src/prims/detail/nbr_intersection.cuh     |   1 +
 ..._v_pair_transform_dst_nbr_intersection.cuh | 265 +++++++++++-------
 cpp/tests/CMakeLists.txt                      | 138 ++++-----
 5 files changed, 258 insertions(+), 175 deletions(-)

diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index bf43b411757..19e14d1d199 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -37,6 +37,16 @@ struct typecast_t {
   __device__ output_t operator()(input_t val) const { return static_cast<output_t>(val); }
 };
 
+template <typename Iterator>
+struct indirection_t {
+  Iterator first{};
+
+  __device__ typename thrust::iterator_traits<Iterator>::value_type operator()(size_t i) const
+  {
+    return *(first + i);
+  }
+};
+
 template <typename T>
 struct not_equal_t {
   T compare{};
diff --git a/cpp/src/detail/graph_utils.cuh b/cpp/src/detail/graph_utils.cuh
index f7b0490f156..7c760bb020b 100644
--- a/cpp/src/detail/graph_utils.cuh
+++ b/cpp/src/detail/graph_utils.cuh
@@ -100,7 +100,24 @@ struct compute_gpu_id_from_int_edge_endpoints_t {
                                         thrust::upper_bound(thrust::seq,
                                                             vertex_partition_range_lasts.begin(),
                                                             vertex_partition_range_lasts.end(),
-                                                            major)));
+                                                            minor)));
+    return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
+  }
+
+  __device__ int operator()(thrust::tuple<vertex_t, vertex_t> pair /* major, minor */) const
+  {
+    auto major_comm_rank =
+      static_cast<int>(thrust::distance(vertex_partition_range_lasts.begin(),
+                                        thrust::upper_bound(thrust::seq,
+                                                            vertex_partition_range_lasts.begin(),
+                                                            vertex_partition_range_lasts.end(),
+                                                            thrust::get<0>(pair))));
+    auto minor_comm_rank =
+      static_cast<int>(thrust::distance(vertex_partition_range_lasts.begin(),
+                                        thrust::upper_bound(thrust::seq,
+                                                            vertex_partition_range_lasts.begin(),
+                                                            vertex_partition_range_lasts.end(),
+                                                            thrust::get<1>(pair))));
     return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
   }
 };
diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index cb1ef32b660..b750342ad17 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/graph.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/device_functors.cuh>
diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
index a0f2d6f5338..af8b354f94d 100644
--- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
+++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
@@ -18,6 +18,7 @@
 #include <detail/graph_utils.cuh>
 #include <prims/detail/nbr_intersection.cuh>
 #include <prims/property_op_utils.cuh>
+#include <utilities/collect_comm.cuh>
 
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
@@ -53,13 +54,16 @@ namespace detail {
 
 template <typename VertexPairIterator>
 struct compute_local_edge_partition_id_t {
+  using vertex_t = typename thrust::
+    tuple_element<0, typename thrust::iterator_traits<VertexPairIterator>::value_type>::type;
+
   VertexPairIterator vertex_pair_first{};
   size_t num_local_edge_partitions{};
   raft::device_span<vertex_t const> edge_partition_major_range_lasts{};
 
   __device__ int operator()(size_t i) const
   {
-    auto major = thurst::get<0>(*(vertex_pair_first + i));
+    auto major = thrust::get<0>(*(vertex_pair_first + i));
     return static_cast<int>(
       thrust::distance(edge_partition_major_range_lasts.begin(),
                        thrust::upper_bound(thrust::seq,
@@ -80,9 +84,18 @@ struct compute_chunk_id_t {
   }
 };
 
+template <typename VertexPairIterator>
+struct indirection_compare_less_t {
+  VertexPairIterator vertex_pair_first{};
+
+  __device__ bool operator()(size_t i, size_t j) const
+  {
+    return *(vertex_pair_first + i) < *(vertex_pair_first + j);
+  }
+};
+
 template <typename GraphViewType,
-          typename EdgePartitionSrcValueInputWrapper,
-          typename EdgePartitionDstValueInputWrapper,
+          typename VertexValueInputIterator,
           typename IntersectionOp,
           typename VertexPairIndexIterator,
           typename VertexPairIterator,
@@ -93,8 +106,8 @@ struct call_intersection_op_t {
                                typename GraphViewType::weight_type,
                                GraphViewType::is_multi_gpu>
     edge_partition{};
-  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{};
-  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{};
+  thrust::optional<raft::device_span<typename GraphViewType::vertex_type const>> unique_vertices;
+  VertexValueInputIterator vertex_property_first;
   IntersectionOp intersection_op{};
   size_t const* nbr_offsets{nullptr};
   typename GraphViewType::vertex_type const* nbr_indices{nullptr};
@@ -104,29 +117,44 @@ struct call_intersection_op_t {
 
   __device__ void operator()(size_t i) const
   {
+    using property_t = typename thrust::iterator_traits<VertexValueInputIterator>::value_type;
+
     auto index        = *(major_minor_pair_index_first + i);
     auto pair         = *(major_minor_pair_first + index);
     auto major        = thrust::get<0>(pair);
     auto minor        = thrust::get<1>(pair);
-    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
     auto src          = GraphViewType::is_storage_transposed ? minor : major;
     auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-    auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-    auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
     auto intersection = raft::device_span<typename GraphViewType::vertex_type const>(
       nbr_indices + nbr_offsets[i], nbr_indices + nbr_offsets[i + 1]);
+
+    property_t src_prop{};
+    property_t dst_prop{};
+    if (unique_vertices) {
+      src_prop = *(vertex_property_first +
+                   thrust::distance(
+                     (*unique_vertices).begin(),
+                     thrust::lower_bound(
+                       thrust::seq, (*unique_vertices).begin(), (*unique_vertices).end(), src)));
+      dst_prop = *(vertex_property_first +
+                   thrust::distance(
+                     (*unique_vertices).begin(),
+                     thrust::lower_bound(
+                       thrust::seq, (*unique_vertices).begin(), (*unique_vertices).end(), dst)));
+    } else {
+      auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+      auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+      auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+      src_prop          = *(vertex_property_first + src_offset);
+      dst_prop          = *(vertex_property_first + src_offset);
+    }
     *(major_minor_pair_value_output_first + index) =
       evaluate_intersection_op<GraphViewType,
-                               typename EdgePartitionSrcValueInputWrapper::value_type,
-                               typename EdgePartitionDstValueInputWrapper::value_type,
+                               decltype(src_prop),
+                               decltype(dst_prop),
                                IntersectionOp>()
-        .compute(src,
-                 dst,
-                 edge_partition_src_value_input.get(src_offset),
-                 edge_partition_dst_value_input.get(dst_offset),
-                 intersection,
-                 intersection_op);
+        .compute(src, dst, src_prop, dst_prop, intersection, intersection_op);
   }
 };
 
@@ -141,8 +169,7 @@ struct call_intersection_op_t {
  *
  * @tparam GraphViewType Type of the passed non-owning graph object.
  * @tparam VertexPairIterator Type of the iterator for input vertex pairs.
- * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
- * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
+ * @tparam VertexValueInputWrapper Type of the wrapper for vertex property values.
  * @tparam IntersectionOp Type of the quinary per intersection operator.
  * @tparam VertexPairValueOutputIterator Type of the iterator for vertex pair output property
  * variables.
@@ -151,16 +178,8 @@ struct call_intersection_op_t {
  * @param graph_view Non-owning graph object.
  * @param vertex_pair_first Iterator pointing to the first (inclusive) input vertex pair.
  * @param vertex_pair_last Iterator pointing to the last (exclusive) input vertex pair.
- * @param edge_src_value_input Wrapper used to access source input property values (for the edge
- * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
- * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
- * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
- * wrapper.
- * @param edge_dst_value_input Wrapper used to access destination input property values (for the
- * edge destinations assigned to this process in multi-GPU). Use either
- * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
- * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
- * values). Use update_edge_dst_property to fill the wrapper.
+ * @param vertex_src_value_input Wrapper used to access vertex input property values (for the
+ * vertices assigned to this process in multi-GPU).
  * @param intersection_op quinary operator takes first vertex of the pair, second vertex of the
  * pair, property values for the first vertex, property values for the second vertex, and a list of
  * vertices in the intersection of the first & second vertices' destination neighbors and returns an
@@ -172,8 +191,7 @@ struct call_intersection_op_t {
  */
 template <typename GraphViewType,
           typename VertexPairIterator,
-          typename EdgeSrcValueInputWrapper,
-          typename EdgeDstValueInputWrapper,
+          typename VertexValueInputIterator,
           typename IntersectionOp,
           typename VertexPairValueOutputIterator>
 void per_v_pair_transform_dst_nbr_intersection(
@@ -181,39 +199,18 @@ void per_v_pair_transform_dst_nbr_intersection(
   GraphViewType const& graph_view,
   VertexPairIterator vertex_pair_first,
   VertexPairIterator vertex_pair_last,
-  EdgeSrcValueInputWrapper edge_src_value_input,
-  EdgeDstValueInputWrapper edge_dst_value_input,
+  VertexValueInputIterator vertex_value_input_first,
   IntersectionOp intersection_op,
   VertexPairValueOutputIterator vertex_pair_value_output_first,
   bool do_expensive_check = false)
 {
   static_assert(!GraphViewType::is_storage_transposed);
 
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using weight_t = typename GraphViewType::weight_type;
-  using result_t = typename thrust::iterator_traits<VertexPairValueOutputIterator>::value_type;
-
-  using edge_partition_src_input_device_view_t = std::conditional_t<
-    std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
-    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
-    std::conditional_t<GraphViewType::is_storage_transposed,
-                       detail::edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeSrcValueInputWrapper::value_iterator>,
-                       detail::edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeSrcValueInputWrapper::value_iterator>>>;
-  using edge_partition_dst_input_device_view_t = std::conditional_t<
-    std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
-    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
-    std::conditional_t<GraphViewType::is_storage_transposed,
-                       detail::edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeDstValueInputWrapper::value_iterator>,
-                       detail::edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeDstValueInputWrapper::value_iterator>>>;
+  using vertex_t   = typename GraphViewType::vertex_type;
+  using edge_t     = typename GraphViewType::edge_type;
+  using weight_t   = typename GraphViewType::weight_type;
+  using property_t = typename thrust::iterator_traits<VertexValueInputIterator>::value_type;
+  using result_t   = typename thrust::iterator_traits<VertexPairValueOutputIterator>::value_type;
 
   if (do_expensive_check) {
     auto num_invalids =
@@ -221,13 +218,51 @@ void per_v_pair_transform_dst_nbr_intersection(
     CUGRAPH_EXPECTS(num_invalids == 0, "");
   }
 
-  rmm::device_uvector<size_t> vertex_pair_indices(
-    thrust::distance(vertex_pair_first, vertex_pair_last), handle.get_stream());
+  auto num_input_pairs = static_cast<size_t>(thrust::distance(vertex_pair_first, vertex_pair_last));
+  std::optional<rmm::device_uvector<vertex_t>> unique_vertices{std::nullopt};
+  std::optional<decltype(allocate_dataframe_buffer<property_t>(size_t{0}, rmm::cuda_stream_view{}))>
+    property_buffer_for_unique_vertices{std::nullopt};
+  if constexpr (GraphViewType::is_multi_gpu) {
+    unique_vertices  = rmm::device_uvector<vertex_t>(num_input_pairs * 2, handle.get_stream());
+    auto elem0_first = thrust::make_transform_iterator(
+      vertex_pair_first,
+      cugraph::thrust_tuple_get<typename thrust::iterator_traits<VertexPairIterator>::value_type,
+                                0>{});
+    thrust::copy(handle.get_thrust_policy(),
+                 elem0_first,
+                 elem0_first + num_input_pairs,
+                 (*unique_vertices).begin());
+    auto elem1_first = thrust::make_transform_iterator(
+      vertex_pair_first,
+      cugraph::thrust_tuple_get<typename thrust::iterator_traits<VertexPairIterator>::value_type,
+                                1>{});
+    thrust::copy(handle.get_thrust_policy(),
+                 elem1_first,
+                 elem1_first + num_input_pairs,
+                 (*unique_vertices).begin() + num_input_pairs);
+    thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end());
+    (*unique_vertices)
+      .resize(thrust::distance((*unique_vertices).begin(),
+                               thrust::unique(handle.get_thrust_policy(),
+                                              (*unique_vertices).begin(),
+                                              (*unique_vertices).end())),
+              handle.get_stream());
+
+    property_buffer_for_unique_vertices =
+      collect_values_for_sorted_unique_vertices(handle.get_comms(),
+                                                (*unique_vertices).data(),
+                                                static_cast<vertex_t>((*unique_vertices).size()),
+                                                vertex_value_input_first,
+                                                graph_view.vertex_partition_range_lasts(),
+                                                handle.get_stream());
+  }
+
+  rmm::device_uvector<size_t> vertex_pair_indices(num_input_pairs, handle.get_stream());
   thrust::sequence(
     handle.get_thrust_policy(), vertex_pair_indices.begin(), vertex_pair_indices.end(), size_t{0});
 
   std::vector<vertex_t> h_edge_partition_major_range_lasts(
-    graph_view.number_of_local_edge_partitions);
+    graph_view.number_of_local_edge_partitions());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_first(i);
   }
@@ -236,7 +271,7 @@ void per_v_pair_transform_dst_nbr_intersection(
   raft::update_device(d_edge_partition_major_range_lasts.data(),
                       h_edge_partition_major_range_lasts.data(),
                       h_edge_partition_major_range_lasts.size(),
-                      std::cout);
+                      handle.get_stream());
   auto d_edge_partition_group_sizes = groupby_and_count(
     vertex_pair_indices.begin(),
     vertex_pair_indices.end(),
@@ -257,25 +292,14 @@ void per_v_pair_transform_dst_nbr_intersection(
   std::vector<size_t> h_edge_partition_group_displacements(h_edge_partition_group_sizes.size());
   std::exclusive_scan(h_edge_partition_group_sizes.begin(),
                       h_edge_partition_group_sizes.end(),
-                      h_edge_partition_group_displacements.begin());
+                      h_edge_partition_group_displacements.begin(),
+                      size_t{0});
 
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
 
-    edge_partition_src_input_device_view_t edge_partition_src_value_input{};
-    edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
-    if constexpr (GraphViewType::is_storage_transposed) {
-      edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input);
-      edge_partition_dst_value_input =
-        edge_partition_dst_input_device_view_t(edge_dst_value_input, i);
-    } else {
-      edge_partition_src_value_input =
-        edge_partition_src_input_device_view_t(edge_src_value_input, i);
-      edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input);
-    }
-
     auto edge_partition_vertex_pair_index_first =
       vertex_pair_indices.begin() + h_edge_partition_group_displacements[i];
 
@@ -284,7 +308,7 @@ void per_v_pair_transform_dst_nbr_intersection(
     // vertex degrees. to limit memory footprint ((1 << 15) is a tuning parameter)
     auto max_chunk_size =
       static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 15);
-    auto max_num_chunks = (majors.size() + max_chunk_size - 1) / max_chunk_size;
+    auto max_num_chunks = (h_edge_partition_group_sizes[i] + max_chunk_size - 1) / max_chunk_size;
     if constexpr (GraphViewType::is_multi_gpu) {
       max_num_chunks = host_scalar_allreduce(
         handle.get_comms(), max_num_chunks, raft::comms::op_t::MAX, handle.get_stream());
@@ -292,35 +316,36 @@ void per_v_pair_transform_dst_nbr_intersection(
 
     std::vector<size_t> h_chunk_sizes(max_num_chunks);
     if (h_chunk_sizes.size() > size_t{1}) {
-      auto d_chunk_sizes =
-        groupby_and_count(edge_partition_vertex_pair_index_first,
-                          edge_partition_vertex_pair_index_first + h_edge_partition_group_sizes[i],
-                          detail::compute_chunk_id_t<VertexPairIterator>{max_num_chunks},
-                          static_cast<int>(max_num_chunks),
-                          std::numeric_limits<size_t>::max(),
-                          handle.get_stream());
+      auto d_chunk_sizes = groupby_and_count(
+        edge_partition_vertex_pair_index_first,
+        edge_partition_vertex_pair_index_first + h_edge_partition_group_sizes[i],
+        detail::compute_chunk_id_t<VertexPairIterator>{vertex_pair_first, max_num_chunks},
+        static_cast<int>(max_num_chunks),
+        std::numeric_limits<size_t>::max(),
+        handle.get_stream());
       raft::update_host(
         h_chunk_sizes.data(), d_chunk_sizes.data(), d_chunk_sizes.size(), handle.get_stream());
       handle.sync_stream();
     } else if (h_chunk_sizes.size() == size_t{1}) {
-      h_chunk_sizes[0] = majors.size();
+      h_chunk_sizes[0] = h_edge_partition_group_sizes[i];
     }
 
-    auto chunk_vertex_pair_index_first = edge_partition_vertx_pair_index_first;
+    auto chunk_vertex_pair_index_first = edge_partition_vertex_pair_index_first;
     for (size_t j = 0; j < h_chunk_sizes.size(); ++j) {
       auto this_chunk_size = h_chunk_sizes[j];
 
       thrust::sort(handle.get_thrust_policy(),
                    chunk_vertex_pair_index_first,
                    chunk_vertex_pair_index_first + this_chunk_size,
-                   indirection_copmare_less_t<VertexPairIterator>{
+                   detail::indirection_compare_less_t<VertexPairIterator>{
                      vertex_pair_first});  // detail::nbr_intersection() requires the input vertex
                                            // pairs to be sorted.
 
-      // FIXME: better restrict detail::nbr_intersection input vetex pairs to a single edge
+      // FIXME: better restrict detail::nbr_intersection input vertex pairs to a single edge
       // partition? This may provide additional performance improvement opportunities???
-      auto chunk_vertex_pair_fist = thrust::make_transform_iterator(
-        chunk_vertex_pair_index_first, indirection_t<VertexPairIterator>{vertex_pair_first});
+      auto chunk_vertex_pair_first = thrust::make_transform_iterator(
+        chunk_vertex_pair_index_first,
+        detail::indirection_t<VertexPairIterator>{vertex_pair_first});
       auto [intersection_offsets, intersection_indices] =
         detail::nbr_intersection(handle,
                                  graph_view,
@@ -329,25 +354,49 @@ void per_v_pair_transform_dst_nbr_intersection(
                                  std::array<bool, 2>{true, true},
                                  do_expensive_check);
 
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator(size_t{0}),
-        thrust::make_counting_iterator(this_chunk_size),
-        detail::call_intersection_op_t<GraphViewType,
-                                       edge_partition_src_input_device_view_t,
-                                       edge_partition_dst_input_device_view_t,
-                                       IntersectionOp,
-                                       decltype(chunk_vertex_pair_index_first),
-                                       VertexPairIterator VertexPairValueOutputIterator>{
-          edge_partition,
-          edge_partition_src_value_input,
-          edge_partition_dst_value_input,
-          intersection_op,
-          intersection_offsets.data(),
-          intersection_indices.data(),
-          chunk_vertex_pair_index_first,
-          vertex_pair_first,
-          vertex_pair_value_output_first});
+      if (unique_vertices) {
+        auto vertex_value_input_for_unique_vertices_first =
+          get_dataframe_buffer_begin(*property_buffer_for_unique_vertices);
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(this_chunk_size),
+          detail::call_intersection_op_t<GraphViewType,
+                                         decltype(vertex_value_input_for_unique_vertices_first),
+                                         IntersectionOp,
+                                         decltype(chunk_vertex_pair_index_first),
+                                         VertexPairIterator,
+                                         VertexPairValueOutputIterator>{
+            edge_partition,
+            thrust::make_optional<raft::device_span<vertex_t const>>((*unique_vertices).data(),
+                                                                     (*unique_vertices).size()),
+            vertex_value_input_for_unique_vertices_first,
+            intersection_op,
+            intersection_offsets.data(),
+            intersection_indices.data(),
+            chunk_vertex_pair_index_first,
+            vertex_pair_first,
+            vertex_pair_value_output_first});
+      } else {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(size_t{0}),
+                         thrust::make_counting_iterator(this_chunk_size),
+                         detail::call_intersection_op_t<GraphViewType,
+                                                        VertexValueInputIterator,
+                                                        IntersectionOp,
+                                                        decltype(chunk_vertex_pair_index_first),
+                                                        VertexPairIterator,
+                                                        VertexPairValueOutputIterator>{
+                           edge_partition,
+                           thrust::optional<raft::device_span<vertex_t const>>{thrust::nullopt},
+                           vertex_value_input_first,
+                           intersection_op,
+                           intersection_offsets.data(),
+                           intersection_indices.data(),
+                           chunk_vertex_pair_index_first,
+                           vertex_pair_first,
+                           vertex_pair_value_output_first});
+      }
 
       chunk_vertex_pair_index_first += this_chunk_size;
     }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c246025e88b..ce64c796c8c 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -428,155 +428,161 @@ if(BUILD_CUGRAPH_MG_TESTS)
       set(GPU_COUNT "1")
     endif()
 
-    ###########################################################################################
-    # - MG SYMMETRIZE tests -------------------------------------------------------------------
+    ###############################################################################################
+    # - MG SYMMETRIZE tests -----------------------------------------------------------------------
     ConfigureTestMG(MG_SYMMETRIZE_TEST structure/mg_symmetrize_test.cpp)
 
-    ###########################################################################################
-    # - MG Transpose tests --------------------------------------------------------------------
+    ###############################################################################################
+    # - MG Transpose tests ------------------------------------------------------------------------
     ConfigureTestMG(MG_TRANSPOSE_TEST structure/mg_transpose_test.cpp)
 
-    ###########################################################################################
-    # - MG Transpose Storage tests ------------------------------------------------------------
+    ###############################################################################################
+    # - MG Transpose Storage tests ----------------------------------------------------------------
     ConfigureTestMG(MG_TRANSPOSE_STORAGE_TEST structure/mg_transpose_storage_test.cpp)
 
-    ###########################################################################################
-    # - MG Count self-loops and multi-edges tests ---------------------------------------------
+    ###############################################################################################
+    # - MG Count self-loops and multi-edges tests -------------------------------------------------
     ConfigureTestMG(MG_COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST
           "structure/mg_count_self_loops_and_multi_edges_test.cpp")
 
-    ###########################################################################################
-    # - MG PAGERANK tests ---------------------------------------------------------------------
+    ###############################################################################################
+    # - MG PAGERANK tests -------------------------------------------------------------------------
     ConfigureTestMG(MG_PAGERANK_TEST link_analysis/mg_pagerank_test.cpp)
 
-    ###########################################################################################
-    # - MG HITS tests -------------------------------------------------------------------------
+    ###############################################################################################
+    # - MG HITS tests -----------------------------------------------------------------------------
     ConfigureTestMG(MG_HITS_TEST link_analysis/mg_hits_test.cpp)
 
-    ###########################################################################################
-    # - MG KATZ CENTRALITY tests --------------------------------------------------------------
+    ###############################################################################################
+    # - MG KATZ CENTRALITY tests ------------------------------------------------------------------
     ConfigureTestMG(MG_KATZ_CENTRALITY_TEST centrality/mg_katz_centrality_test.cpp)
 
-    ###########################################################################################
-    # - MG EIGENVECTOR CENTRALITY tests --------------------------------------------------------------
+    ###############################################################################################
+    # - MG EIGENVECTOR CENTRALITY tests -----------------------------------------------------------
     ConfigureTestMG(MG_EIGENVECTOR_CENTRALITY_TEST centrality/mg_eigenvector_centrality_test.cpp)
 
-    ###########################################################################################
-    # - MG BFS tests --------------------------------------------------------------------------
+    ###############################################################################################
+    # - MG BFS tests ------------------------------------------------------------------------------
     ConfigureTestMG(MG_BFS_TEST traversal/mg_bfs_test.cpp)
 
-    ###########################################################################################
-    # - Extract BFS Paths tests ---------------------------------------------------------------
+    ###############################################################################################
+    # - Extract BFS Paths tests -------------------------------------------------------------------
     ConfigureTestMG(MG_EXTRACT_BFS_PATHS_TEST
                     traversal/mg_extract_bfs_paths_test.cu)
 
-    ###########################################################################################
-    # - MG SSSP tests -------------------------------------------------------------------------
+    ###############################################################################################
+    # - MG SSSP tests -----------------------------------------------------------------------------
     ConfigureTestMG(MG_SSSP_TEST traversal/mg_sssp_test.cpp)
 
-    ###########################################################################################
-    # - MG LOUVAIN tests ----------------------------------------------------------------------
+    ###############################################################################################
+    # - MG LOUVAIN tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_LOUVAIN_TEST
         community/mg_louvain_helper.cu
         community/mg_louvain_test.cpp)
 
-    ###########################################################################################
-    # - MG WEAKLY CONNECTED COMPONENTS tests --------------------------------------------------
+    ###############################################################################################
+    # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------
     ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST
                     components/mg_weakly_connected_components_test.cpp)
 
-    ###########################################################################################
-    # - MG GRAPH BROADCAST tests --------------------------------------------------------------
+    ###############################################################################################
+    # - MG GRAPH BROADCAST tests ------------------------------------------------------------------
     ConfigureTestMG(MG_GRAPH_BROADCAST_TEST bcast/mg_graph_bcast.cpp)
 
-    ###########################################################################################
-    # - MG Core Number tests ------------------------------------------------------------------
+    ###############################################################################################
+    # - MG Core Number tests ----------------------------------------------------------------------
     ConfigureTestMG(MG_CORE_NUMBER_TEST cores/mg_core_number_test.cpp)
 
-    ###########################################################################################
-    # - MG TRIANGLE COUNT tests ---------------------------------------------------------------
+    ###############################################################################################
+    # - MG TRIANGLE COUNT tests -------------------------------------------------------------------
     ConfigureTestMG(MG_TRIANGLE_COUNT_TEST community/mg_triangle_count_test.cpp)
 
-    ###########################################################################################
-    # - MG PRIMS COUNT_IF_V tests -------------------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS COUNT_IF_V tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu)
     target_link_libraries(MG_COUNT_IF_V_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests --------------------------
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST
                     prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu)
     target_link_libraries(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS REDUCE_V tests ---------------------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS REDUCE_V tests -------------------------------------------------------------------
     ConfigureTestMG(MG_REDUCE_V_TEST prims/mg_reduce_v.cu)
     target_link_libraries(MG_REDUCE_V_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS TRANSFORM_REDUCE_V tests -----------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_V tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_TEST prims/mg_transform_reduce_v.cu)
     target_link_libraries(MG_TRANSFORM_REDUCE_V_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS TRANSFORM_REDUCE_E tests -----------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_E tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_E_TEST prims/mg_transform_reduce_e.cu)
     target_link_libraries(MG_TRANSFORM_REDUCE_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS COUNT_IF_E tests -------------------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS COUNT_IF_E tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_E_TEST prims/mg_count_if_e.cu)
     target_link_libraries(MG_COUNT_IF_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E tests -----------------------------
+    ###############################################################################################
+    # - MG PRIMS PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E tests ---------------------------------
     ConfigureTestMG(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST
       prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu)
     target_link_libraries(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS EXTRACT_IF_E tests -----------------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS EXTRACT_IF_E tests ---------------------------------------------------------------
     ConfigureTestMG(MG_EXTRACT_IF_E_TEST prims/mg_extract_if_e.cu)
     target_link_libraries(MG_EXTRACT_IF_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E tests --------------------------------
+    ###############################################################################################
+    # - MG PRIMS EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E tests ------------------------------------
     ConfigureTestMG(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST
                     prims/mg_extract_transform_v_frontier_outgoing_e.cu)
     target_link_libraries(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG PRIMS PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E tests -------------------------------
+    ###############################################################################################
+    # - MG PRIMS PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E tests -----------------------------------
     ConfigureTestMG(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST
                     prims/mg_per_v_random_select_transform_outgoing_e.cu)
     target_link_libraries(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG GATHER_UTILS tests -----------------------------------------------------------------
+    ###############################################################################################
+    # - MG PRIMS PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION tests ----------------------------------
+    ConfigureTestMG(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST
+                    prims/mg_per_v_pair_transform_dst_nbr_intersection.cu)
+    target_link_libraries(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG GATHER_UTILS tests ---------------------------------------------------------------------
     ConfigureTestMG(MG_GATHER_UTILS_TEST sampling/detail/mg_gather_utils.cu)
     target_link_libraries(MG_GATHER_UTILS_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG GATHER_ONE_HOP tests ---------------------------------------------------------------
+    ###############################################################################################
+    # - MG GATHER_ONE_HOP tests -------------------------------------------------------------------
     ConfigureTestMG(MG_GATHER_ONE_HOP_TEST sampling/detail/mg_gather_one_hop.cu)
     target_link_libraries(MG_GATHER_ONE_HOP_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - MG NBR SAMPLING tests -----------------------------------------------------------------
+    ###############################################################################################
+    # - MG NBR SAMPLING tests ---------------------------------------------------------------------
     ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cu)
     target_link_libraries(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
-    ###########################################################################################
-    # - RANDOM_WALKS tests --------------------------------------------------------------------
-#    ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp)
+    ###############################################################################################
+    # - RANDOM_WALKS tests ------------------------------------------------------------------------
+    # ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp)
 
-    ###################################################################################################
-    # - SIMILARITY tests ------------------------------------------------------------------------------
+    ###############################################################################################
+    # - SIMILARITY tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_SIMILARITY_TEST link_prediction/mg_similarity_test.cpp)
 
 
-    ###########################################################################################
-    # - MG C API tests ------------------------------------------------------------------------
+    ###############################################################################################
+    # - MG C API tests ----------------------------------------------------------------------------
     ConfigureCTestMG(MG_CAPI_CREATE_GRAPH_TEST c_api/mg_create_graph_test.c c_api/mg_test_utils.cpp)
     ConfigureCTestMG(MG_CAPI_PAGERANK_TEST c_api/mg_pagerank_test.c c_api/mg_test_utils.cpp)
     ConfigureCTestMG(MG_CAPI_BFS_TEST c_api/mg_bfs_test.c c_api/mg_test_utils.cpp)
@@ -598,7 +604,7 @@ endif()
 # - C API tests -----------------------------------------------------------------------------------
 
 ###################################################################################################
-# - common C API test utils -----------------------------------------------------------------------------
+# - common C API test utils -----------------------------------------------------------------------
 
 add_library(cugraph_c_testutil STATIC
             c_api/test_utils.cpp)

From 789f15c81ce3ef14d82416a25c9ef0e027ae2fc7 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 27 Sep 2022 14:43:25 -0700
Subject: [PATCH 17/18] test MG prim test

---
 ...r_v_pair_transform_dst_nbr_intersection.cu | 273 ++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu

diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
new file mode 100644
index 00000000000..73a1c9c0ae5
--- /dev/null
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/high_res_clock.h>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <prims/per_v_pair_transform_dst_nbr_intersection.cuh>
+
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <cugraph/graph_view.hpp>
+
+#include <raft/core/device_span.hpp>
+#include <raft/comms/comms.hpp>
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+template <typename vertex_t, typename edge_t>
+struct intersection_op_t {
+  __device__ thrust::tuple<edge_t, edge_t> operator()(
+    vertex_t,
+    vertex_t,
+    edge_t v0_prop /* out degree */,
+    edge_t v1_prop /* out degree */,
+    raft::device_span<vertex_t const> intersection) const
+  {
+    return thrust::make_tuple(v0_prop + v1_prop, static_cast<edge_t>(intersection.size()));
+  }
+};
+
+struct Prims_Usecase {
+  size_t num_vertex_pairs{0};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGPerVPairTransformDstNbrIntersection
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGPerVPairTransformDstNbrIntersection() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Verify the results of per_v_pair_transform_dst_nbr_intersection primitive
+  template <typename vertex_t, typename edge_t, typename weight_t, typename property_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResClock hr_clock{};
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    auto const comm_size = handle_->get_comms().get_size();
+    auto const row_comm_size = handle_->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()).get_size();
+    auto const col_comm_size = handle_->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()).get_size();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_clock.start();
+    }
+
+    auto [mg_graph, d_mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 2. run MG per_v_pair_transform_dst_nbr_intersection primitive
+
+    ASSERT_TRUE(
+      mg_graph_view.number_of_vertices() >
+      vertex_t{0});  // the code below to generate vertex pairs is invalid for an empty graph.
+
+    auto mg_vertex_pair_buffer =
+      cugraph::allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
+        prims_usecase.num_vertex_pairs / comm_size +
+          (static_cast<size_t>(comm_rank) < prims_usecase.num_vertex_pairs % comm_size ? 1 : 0),
+        handle_->get_stream());
+    thrust::tabulate(
+      handle_->get_thrust_policy(),
+      cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
+      cugraph::get_dataframe_buffer_end(mg_vertex_pair_buffer),
+      [comm_rank, num_vertices = mg_graph_view.number_of_vertices()] __device__(size_t i) {
+        cuco::detail::MurmurHash3_32<size_t>
+          hash_func{};  // use hash_func to generate arbitrary vertex pairs
+        auto v0 = static_cast<vertex_t>(hash_func(i + comm_rank) % num_vertices);
+        auto v1 = static_cast<vertex_t>(hash_func(i + num_vertices + comm_rank) % num_vertices);
+        return thrust::make_tuple(v0, v1);
+      });
+
+    auto h_vertex_partition_range_lasts = mg_graph_view.vertex_partition_range_lasts();
+    rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(h_vertex_partition_range_lasts.size(), handle_->get_stream());
+    raft::update_device(d_vertex_partition_range_lasts.data(), h_vertex_partition_range_lasts.data(), h_vertex_partition_range_lasts.size(), handle_->get_stream());
+    std::tie(mg_vertex_pair_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
+      handle_->get_comms(),
+      cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
+      cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
+      cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t<vertex_t>{raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(), d_vertex_partition_range_lasts.size()), comm_size, row_comm_size, col_comm_size},
+      handle_->get_stream());
+
+    auto mg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(
+      cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), handle_->get_stream());
+    auto mg_out_degrees = mg_graph_view.compute_out_degrees(*handle_);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_clock.start();
+    }
+
+    cugraph::per_v_pair_transform_dst_nbr_intersection(
+      *handle_,
+      mg_graph_view,
+      cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
+      cugraph::get_dataframe_buffer_end(mg_vertex_pair_buffer),
+      mg_out_degrees.begin(),
+      intersection_op_t<vertex_t, edge_t>{},
+      cugraph::get_dataframe_buffer_begin(mg_result_buffer));
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG per_v_pair_transform_dst_nbr_intersection took " << elapsed_time * 1e-6
+                << " s.\n";
+    }
+
+    // 3. validate MG results
+
+    if (prims_usecase.check_correctness) {
+      cugraph::unrenumber_int_vertices<vertex_t, true>(*handle_, std::get<0>(mg_vertex_pair_buffer).data(), cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), (*d_mg_renumber_map_labels).data(), h_vertex_partition_range_lasts);
+      cugraph::unrenumber_int_vertices<vertex_t, true>(*handle_, std::get<1>(mg_vertex_pair_buffer).data(), cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), (*d_mg_renumber_map_labels).data(), h_vertex_partition_range_lasts);
+
+      cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> unrenumbered_graph(*handle_);
+      std::tie(unrenumbered_graph, std::ignore) =
+        cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+          *handle_, input_usecase, false, false);
+
+      auto unrenumbered_graph_view = unrenumbered_graph.view();
+
+      auto sg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), handle_->get_stream());
+      auto sg_out_degrees = unrenumbered_graph_view.compute_out_degrees(*handle_);
+
+      cugraph::per_v_pair_transform_dst_nbr_intersection(
+        *handle_,
+        unrenumbered_graph_view,
+        cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer /* now unrenumbered */),
+        cugraph::get_dataframe_buffer_end(mg_vertex_pair_buffer /* now unrenumbered */),
+        sg_out_degrees.begin(),
+        intersection_op_t<vertex_t, edge_t>{},
+        cugraph::get_dataframe_buffer_begin(sg_result_buffer));
+
+      bool valid = thrust::equal(handle_->get_thrust_policy(),
+                                 cugraph::get_dataframe_buffer_begin(mg_result_buffer),
+                                 cugraph::get_dataframe_buffer_end(mg_result_buffer),
+                                 cugraph::get_dataframe_buffer_begin(sg_result_buffer));
+
+      valid = static_cast<bool>(cugraph::host_scalar_allreduce(
+        handle_->get_comms(), static_cast<int>(valid), raft::comms::op_t::MIN, handle_->get_stream()));
+      ASSERT_TRUE(valid);
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGPerVPairTransformDstNbrIntersection<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGPerVPairTransformDstNbrIntersection_File =
+  Tests_MGPerVPairTransformDstNbrIntersection<cugraph::test::File_Usecase>;
+using Tests_MGPerVPairTransformDstNbrIntersection_Rmat =
+  Tests_MGPerVPairTransformDstNbrIntersection<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGPerVPairTransformDstNbrIntersection_File, CheckInt32Int32FloatTupleIntFloat)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(std::get<0>(param),
+                                                                       std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVPairTransformDstNbrIntersection_Rmat, CheckInt32Int32FloatTupleIntFloat)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVPairTransformDstNbrIntersection_File, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVPairTransformDstNbrIntersection_Rmat, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGPerVPairTransformDstNbrIntersection_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{size_t{1024}, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGPerVPairTransformDstNbrIntersection_Rmat,
+  ::testing::Combine(::testing::Values(Prims_Usecase{size_t{1024}, true}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       10, 16, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_large_test,
+  Tests_MGPerVPairTransformDstNbrIntersection_Rmat,
+  ::testing::Combine(::testing::Values(Prims_Usecase{size_t{1024 * 1024}, false}),
+                     ::testing::Values(cugraph::test::Rmat_Usecase(
+                       20, 32, 0.57, 0.19, 0.19, 0, false, false, 0, true))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()

From 69f307ce80b9e0011563ff724a138e9aadb3990d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <seunghwak@nvidia.com>
Date: Tue, 27 Sep 2022 15:17:18 -0700
Subject: [PATCH 18/18] clang-format

---
 ...r_v_pair_transform_dst_nbr_intersection.cu | 46 ++++++++++++++-----
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
index 73a1c9c0ae5..457e309a65b 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
@@ -30,9 +30,9 @@
 
 #include <cugraph/graph_view.hpp>
 
-#include <raft/core/device_span.hpp>
 #include <raft/comms/comms.hpp>
 #include <raft/comms/mpi_comms.hpp>
+#include <raft/core/device_span.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -82,8 +82,10 @@ class Tests_MGPerVPairTransformDstNbrIntersection
 
     auto const comm_rank = handle_->get_comms().get_rank();
     auto const comm_size = handle_->get_comms().get_size();
-    auto const row_comm_size = handle_->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()).get_size();
-    auto const col_comm_size = handle_->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()).get_size();
+    auto const row_comm_size =
+      handle_->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()).get_size();
+    auto const col_comm_size =
+      handle_->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()).get_size();
 
     // 1. create MG graph
 
@@ -131,13 +133,22 @@ class Tests_MGPerVPairTransformDstNbrIntersection
       });
 
     auto h_vertex_partition_range_lasts = mg_graph_view.vertex_partition_range_lasts();
-    rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(h_vertex_partition_range_lasts.size(), handle_->get_stream());
-    raft::update_device(d_vertex_partition_range_lasts.data(), h_vertex_partition_range_lasts.data(), h_vertex_partition_range_lasts.size(), handle_->get_stream());
+    rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(
+      h_vertex_partition_range_lasts.size(), handle_->get_stream());
+    raft::update_device(d_vertex_partition_range_lasts.data(),
+                        h_vertex_partition_range_lasts.data(),
+                        h_vertex_partition_range_lasts.size(),
+                        handle_->get_stream());
     std::tie(mg_vertex_pair_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
       handle_->get_comms(),
       cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
       cugraph::get_dataframe_buffer_begin(mg_vertex_pair_buffer),
-      cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t<vertex_t>{raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(), d_vertex_partition_range_lasts.size()), comm_size, row_comm_size, col_comm_size},
+      cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t<vertex_t>{
+        raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
+                                          d_vertex_partition_range_lasts.size()),
+        comm_size,
+        row_comm_size,
+        col_comm_size},
       handle_->get_stream());
 
     auto mg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(
@@ -171,8 +182,18 @@ class Tests_MGPerVPairTransformDstNbrIntersection
     // 3. validate MG results
 
     if (prims_usecase.check_correctness) {
-      cugraph::unrenumber_int_vertices<vertex_t, true>(*handle_, std::get<0>(mg_vertex_pair_buffer).data(), cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), (*d_mg_renumber_map_labels).data(), h_vertex_partition_range_lasts);
-      cugraph::unrenumber_int_vertices<vertex_t, true>(*handle_, std::get<1>(mg_vertex_pair_buffer).data(), cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), (*d_mg_renumber_map_labels).data(), h_vertex_partition_range_lasts);
+      cugraph::unrenumber_int_vertices<vertex_t, true>(
+        *handle_,
+        std::get<0>(mg_vertex_pair_buffer).data(),
+        cugraph::size_dataframe_buffer(mg_vertex_pair_buffer),
+        (*d_mg_renumber_map_labels).data(),
+        h_vertex_partition_range_lasts);
+      cugraph::unrenumber_int_vertices<vertex_t, true>(
+        *handle_,
+        std::get<1>(mg_vertex_pair_buffer).data(),
+        cugraph::size_dataframe_buffer(mg_vertex_pair_buffer),
+        (*d_mg_renumber_map_labels).data(),
+        h_vertex_partition_range_lasts);
 
       cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> unrenumbered_graph(*handle_);
       std::tie(unrenumbered_graph, std::ignore) =
@@ -181,7 +202,8 @@ class Tests_MGPerVPairTransformDstNbrIntersection
 
       auto unrenumbered_graph_view = unrenumbered_graph.view();
 
-      auto sg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), handle_->get_stream());
+      auto sg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(
+        cugraph::size_dataframe_buffer(mg_vertex_pair_buffer), handle_->get_stream());
       auto sg_out_degrees = unrenumbered_graph_view.compute_out_degrees(*handle_);
 
       cugraph::per_v_pair_transform_dst_nbr_intersection(
@@ -198,8 +220,10 @@ class Tests_MGPerVPairTransformDstNbrIntersection
                                  cugraph::get_dataframe_buffer_end(mg_result_buffer),
                                  cugraph::get_dataframe_buffer_begin(sg_result_buffer));
 
-      valid = static_cast<bool>(cugraph::host_scalar_allreduce(
-        handle_->get_comms(), static_cast<int>(valid), raft::comms::op_t::MIN, handle_->get_stream()));
+      valid = static_cast<bool>(cugraph::host_scalar_allreduce(handle_->get_comms(),
+                                                               static_cast<int>(valid),
+                                                               raft::comms::op_t::MIN,
+                                                               handle_->get_stream()));
       ASSERT_TRUE(valid);
     }
   }