From a1f8a653b6316bd43141a6df8ec7e0ac7f7b4a8a Mon Sep 17 00:00:00 2001
From: Joseph Nke <76006812+jnke2016@users.noreply.github.com>
Date: Tue, 30 Jul 2024 07:01:44 +0100
Subject: [PATCH] Optimize K-Truss (#4375)

This PR leverages our existing edge triangle count to implement both SG and MG K-Truss as our initial version.
This PR also:

- Exposes the `rx count` in several as our shuffling functions
- Add C and C++ tests for MG K-Truss

Closes #4500

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Brad Rees (https://github.com/BradReesWork)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4375
---
 cpp/CMakeLists.txt                            |   3 +
 .../cugraph/detail/shuffle_wrappers.hpp       |   8 +-
 .../mtmg/detail/per_device_edgelist.hpp       |   3 +-
 cpp/src/c_api/graph_functions.cpp             |   2 +-
 cpp/src/c_api/graph_mg.cpp                    |   3 +-
 cpp/src/c_api/k_truss.cpp                     |   3 -
 cpp/src/community/detail/refine_impl.cuh      |   1 +
 .../community/edge_triangle_count_impl.cuh    |   2 +-
 cpp/src/community/k_truss_impl.cuh            | 674 ++----------------
 cpp/src/community/k_truss_mg_v32_e32.cu       |  41 ++
 cpp/src/community/k_truss_mg_v32_e64.cu       |  41 ++
 cpp/src/community/k_truss_mg_v64_e64.cu       |  41 ++
 cpp/src/community/triangle_count_impl.cuh     |   2 +-
 .../weakly_connected_components_impl.cuh      |   1 +
 cpp/src/cores/k_core_impl.cuh                 |   4 +-
 cpp/src/link_prediction/similarity_impl.cuh   |   4 +-
 cpp/src/structure/coarsen_graph_impl.cuh      |   4 +-
 .../structure/symmetrize_edgelist_impl.cuh    |   2 +
 cpp/src/structure/transpose_graph_impl.cuh    |   1 +
 .../transpose_graph_storage_impl.cuh          |   1 +
 cpp/src/utilities/shuffle_vertex_pairs.cuh    |  43 +-
 .../shuffle_vertex_pairs_mg_v32_e32.cu        |  18 +-
 .../shuffle_vertex_pairs_mg_v32_e64.cu        |  18 +-
 .../shuffle_vertex_pairs_mg_v64_e64.cu        |  18 +-
 .../shuffle_vertices_mg_v32_integral.cu       |   7 +
 .../shuffle_vertices_mg_v64_integral.cu       |   7 +
 cpp/tests/CMakeLists.txt                      |   7 +-
 cpp/tests/c_api/mg_k_truss_test.c             | 150 ++++
 cpp/tests/community/mg_k_truss_test.cpp       | 286 ++++++++
 .../link_prediction/mg_similarity_test.cpp    |   2 +-
 .../mg_weighted_similarity_test.cpp           |   2 +-
 ...r_v_pair_transform_dst_nbr_intersection.cu |   1 +
 ...transform_dst_nbr_weighted_intersection.cu |   1 +
 ...has_edge_and_compute_multiplicity_test.cpp |   1 +
 cpp/tests/utilities/test_graphs.hpp           |   1 +
 35 files changed, 719 insertions(+), 684 deletions(-)
 create mode 100644 cpp/src/community/k_truss_mg_v32_e32.cu
 create mode 100644 cpp/src/community/k_truss_mg_v32_e64.cu
 create mode 100644 cpp/src/community/k_truss_mg_v64_e64.cu
 create mode 100644 cpp/tests/c_api/mg_k_truss_test.c
 create mode 100644 cpp/tests/community/mg_k_truss_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 61ec34c3319..441627cabce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -284,6 +284,9 @@ set(CUGRAPH_SOURCES
     src/community/k_truss_sg_v64_e64.cu
     src/community/k_truss_sg_v32_e32.cu
     src/community/k_truss_sg_v32_e64.cu
+    src/community/k_truss_mg_v64_e64.cu
+    src/community/k_truss_mg_v32_e32.cu
+    src/community/k_truss_mg_v32_e64.cu
     src/lookup/lookup_src_dst_mg_v32_e32.cu
     src/lookup/lookup_src_dst_mg_v32_e64.cu
     src/lookup/lookup_src_dst_mg_v64_e64.cu
diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
index 69d48098a5d..7dffcce298a 100644
--- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp
+++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -53,7 +53,8 @@ std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_id_t>>>
+           std::optional<rmm::device_uvector<edge_type_id_t>>,
+           std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<vertex_t>&& majors,
@@ -86,14 +87,15 @@ shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
  * (exclusive) vertex ID.
  *
  * @return Tuple of vectors storing shuffled major vertices, minor vertices and optional weights,
- * edge ids and edge types
+ * edge ids and edge types and rx counts
  */
 template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_id_t>
 std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_id_t>>>
+           std::optional<rmm::device_uvector<edge_type_id_t>>,
+           std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<vertex_t>&& majors,
diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
index 63d7fd9685e..61ad833a529 100644
--- a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
+++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp
@@ -251,7 +251,8 @@ class per_device_edgelist_t {
              store_transposed ? src_[0] : dst_[0],
              tmp_wgt,
              tmp_edge_id,
-             tmp_edge_type) =
+             tmp_edge_type,
+             std::ignore) =
       cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
         handle.raft_handle(),
         store_transposed ? std::move(dst_[0]) : std::move(src_[0]),
diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp
index 91371b988b3..df741a349d2 100644
--- a/cpp/src/c_api/graph_functions.cpp
+++ b/cpp/src/c_api/graph_functions.cpp
@@ -72,7 +72,7 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor {
         second_copy.data(), second_->as_type<vertex_t>(), second_->size_, handle_.get_stream());
 
       if constexpr (multi_gpu) {
-        std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore) =
+        std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore, std::ignore) =
           cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
             vertex_t,
             edge_t,
diff --git a/cpp/src/c_api/graph_mg.cpp b/cpp/src/c_api/graph_mg.cpp
index 22ceea3f629..cc4acd31743 100644
--- a/cpp/src/c_api/graph_mg.cpp
+++ b/cpp/src/c_api/graph_mg.cpp
@@ -167,7 +167,8 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
                store_transposed ? edgelist_srcs : edgelist_dsts,
                edgelist_weights,
                edgelist_edge_ids,
-               edgelist_edge_types) =
+               edgelist_edge_types,
+               std::ignore) =
         cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
           handle_,
           std::move(store_transposed ? edgelist_dsts : edgelist_srcs),
diff --git a/cpp/src/c_api/k_truss.cpp b/cpp/src/c_api/k_truss.cpp
index 18e256b022a..37a0672676e 100644
--- a/cpp/src/c_api/k_truss.cpp
+++ b/cpp/src/c_api/k_truss.cpp
@@ -60,10 +60,7 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor {
   {
     if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
       unsupported();
-    } else if constexpr (multi_gpu) {
-      unsupported();
     } else {
-      // k_truss expects store_transposed == false
       if constexpr (store_transposed) {
         error_code_ = cugraph::c_api::
           transpose_storage<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 99fc1cd6fae..272e3d71f83 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -627,6 +627,7 @@ refine_clustering(
                store_transposed ? d_srcs : d_dsts,
                d_weights,
                std::ignore,
+               std::ignore,
                std::ignore) =
         cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
           vertex_t,
diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh
index c4277e240be..225687c4cf0 100644
--- a/cpp/src/community/edge_triangle_count_impl.cuh
+++ b/cpp/src/community/edge_triangle_count_impl.cuh
@@ -250,7 +250,7 @@ edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, edge_t> edge_t
                          handle.get_stream());
 
       // There are still multiple copies here but is it worth sorting and reducing again?
-      std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore) =
+      std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore, std::ignore) =
         shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                edge_t,
                                                                                weight_t,
diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh
index f2d4c04dffd..e052a892917 100644
--- a/cpp/src/community/k_truss_impl.cuh
+++ b/cpp/src/community/k_truss_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,12 +17,14 @@
 
 #include "prims/edge_bucket.cuh"
 #include "prims/extract_transform_e.cuh"
+#include "prims/extract_transform_v_frontier_outgoing_e.cuh"
 #include "prims/fill_edge_property.cuh"
 #include "prims/transform_e.cuh"
 #include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 
 #include <cugraph/algorithms.hpp>
+#include <cugraph/detail/collect_comm_wrapper.hpp>
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/error.hpp>
@@ -41,351 +43,6 @@
 
 namespace cugraph {
 
-template <typename vertex_t, typename edge_t, typename EdgeIterator>
-struct unroll_edge {
-  size_t num_valid_edges{};
-  raft::device_span<edge_t> num_triangles{};
-  EdgeIterator edge_to_unroll_first{};
-  EdgeIterator transposed_valid_edge_first{};
-  EdgeIterator transposed_valid_edge_last{};
-  EdgeIterator transposed_invalid_edge_last{};
-
-  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
-  {
-    // edges are sorted with destination as key so reverse the edge when looking it
-    auto pair = thrust::make_tuple(thrust::get<1>(*(edge_to_unroll_first + i)),
-                                   thrust::get<0>(*(edge_to_unroll_first + i)));
-    // Find its position in either partition of the transposed edgelist
-    // An edge can be in found in either of the two partitions (valid or invalid)
-    auto itr = thrust::lower_bound(
-      thrust::seq, transposed_valid_edge_last, transposed_invalid_edge_last, pair);
-    size_t idx{};
-    if (itr != transposed_invalid_edge_last && *itr == pair) {
-      idx =
-        static_cast<size_t>(thrust::distance(transposed_valid_edge_last, itr) + num_valid_edges);
-    } else {
-      // The edge must be in the first boundary
-      itr = thrust::lower_bound(
-        thrust::seq, transposed_valid_edge_first, transposed_valid_edge_last, pair);
-      assert(*itr == pair);
-      idx = thrust::distance(transposed_valid_edge_first, itr);
-    }
-    cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(num_triangles[idx]);
-    auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed);
-  }
-};
-
-// FIXME: May re-locate this function as a general utility function for graph algorithm
-// implementations.
-template <typename vertex_t>
-rmm::device_uvector<vertex_t> compute_prefix_sum(raft::handle_t const& handle,
-                                                 raft::device_span<vertex_t const> sorted_vertices,
-                                                 raft::device_span<vertex_t const> query_vertices)
-{
-  rmm::device_uvector<vertex_t> prefix_sum(query_vertices.size() + 1, handle.get_stream());
-
-  auto count_first = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(size_t{0}),
-    cuda::proclaim_return_type<vertex_t>(
-      [query_vertices,
-       num_edges       = sorted_vertices.size(),
-       sorted_vertices = sorted_vertices.begin()] __device__(size_t idx) {
-        auto itr_lower = thrust::lower_bound(
-          thrust::seq, sorted_vertices, sorted_vertices + num_edges, query_vertices[idx]);
-
-        auto itr_upper = thrust::upper_bound(
-          thrust::seq, itr_lower, sorted_vertices + num_edges, query_vertices[idx]);
-        vertex_t dist = thrust::distance(itr_lower, itr_upper);
-
-        return dist;
-      }));
-
-  thrust::exclusive_scan(handle.get_thrust_policy(),
-                         count_first,
-                         count_first + query_vertices.size() + 1,
-                         prefix_sum.begin());
-
-  return prefix_sum;
-}
-
-template <typename vertex_t, typename edge_t, typename EdgeIterator>
-edge_t remove_overcompensating_edges(raft::handle_t const& handle,
-                                     size_t buffer_size,
-                                     EdgeIterator potential_closing_or_incoming_edges,
-                                     EdgeIterator incoming_or_potential_closing_edges,
-                                     raft::device_span<vertex_t const> invalid_edgelist_srcs,
-                                     raft::device_span<vertex_t const> invalid_edgelist_dsts)
-{
-  // To avoid over-compensating, check whether the 'potential_closing_edges'
-  // are within the invalid edges. If yes, the was already unrolled
-  auto edges_not_overcomp = thrust::remove_if(
-    handle.get_thrust_policy(),
-    thrust::make_zip_iterator(potential_closing_or_incoming_edges,
-                              incoming_or_potential_closing_edges),
-    thrust::make_zip_iterator(potential_closing_or_incoming_edges + buffer_size,
-                              incoming_or_potential_closing_edges + buffer_size),
-    [num_invalid_edges = invalid_edgelist_dsts.size(),
-     invalid_first =
-       thrust::make_zip_iterator(invalid_edgelist_dsts.begin(), invalid_edgelist_srcs.begin()),
-     invalid_last = thrust::make_zip_iterator(invalid_edgelist_dsts.end(),
-                                              invalid_edgelist_srcs.end())] __device__(auto e) {
-      auto potential_edge = thrust::get<0>(e);
-      auto transposed_potential_or_incoming_edge =
-        thrust::make_tuple(thrust::get<1>(potential_edge), thrust::get<0>(potential_edge));
-      auto itr = thrust::lower_bound(
-        thrust::seq, invalid_first, invalid_last, transposed_potential_or_incoming_edge);
-      return (itr != invalid_last && *itr == transposed_potential_or_incoming_edge);
-    });
-
-  auto dist = thrust::distance(thrust::make_zip_iterator(potential_closing_or_incoming_edges,
-                                                         incoming_or_potential_closing_edges),
-                               edges_not_overcomp);
-
-  return dist;
-}
-
-template <typename vertex_t, typename edge_t, bool multi_gpu, bool is_q_r_edge>
-void unroll_p_r_or_q_r_edges(raft::handle_t const& handle,
-                             graph_view_t<vertex_t, edge_t, false, false>& graph_view,
-                             size_t num_invalid_edges,
-                             size_t num_valid_edges,
-                             raft::device_span<vertex_t const> edgelist_srcs,
-                             raft::device_span<vertex_t const> edgelist_dsts,
-                             raft::device_span<edge_t> num_triangles)
-{
-  auto prefix_sum_valid = compute_prefix_sum(
-    handle,
-    raft::device_span<vertex_t const>(edgelist_dsts.data(), num_valid_edges),
-    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
-
-  auto prefix_sum_invalid = compute_prefix_sum(
-    handle,
-    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges),
-    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
-
-  auto potential_closing_edges = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
-    prefix_sum_valid.back_element(handle.get_stream()) +
-      prefix_sum_invalid.back_element(handle.get_stream()),
-    handle.get_stream());
-
-  auto incoming_edges_to_r = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
-    prefix_sum_valid.back_element(handle.get_stream()) +
-      prefix_sum_invalid.back_element(handle.get_stream()),
-    handle.get_stream());
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(num_invalid_edges),
-    [num_valid_edges,
-     num_invalid_edges,
-     invalid_dst_first       = edgelist_dsts.begin() + num_valid_edges,
-     invalid_src_first       = edgelist_srcs.begin() + num_valid_edges,
-     valid_src_first         = edgelist_srcs.begin(),
-     valid_dst_first         = edgelist_dsts.begin(),
-     prefix_sum_valid        = prefix_sum_valid.data(),
-     prefix_sum_invalid      = prefix_sum_invalid.data(),
-     potential_closing_edges = get_dataframe_buffer_begin(potential_closing_edges),
-     incoming_edges_to_r = get_dataframe_buffer_begin(incoming_edges_to_r)] __device__(auto idx) {
-      auto src                 = invalid_src_first[idx];
-      auto dst                 = invalid_dst_first[idx];
-      auto dst_array_end_valid = valid_dst_first + num_valid_edges;
-
-      auto itr_lower_valid =
-        thrust::lower_bound(thrust::seq, valid_dst_first, dst_array_end_valid, dst);
-      auto idx_lower_valid = thrust::distance(
-        valid_dst_first,
-        itr_lower_valid);  // Need a binary search to find the begining of the range
-
-      auto invalid_end_dst = invalid_dst_first + num_invalid_edges;
-
-      auto itr_lower_invalid =
-        thrust::lower_bound(thrust::seq, invalid_dst_first, invalid_end_dst, dst);
-      auto idx_lower_invalid = thrust::distance(
-        invalid_dst_first,
-        itr_lower_invalid);  // Need a binary search to find the begining of the range
-
-      auto incoming_edges_to_r_first_valid = thrust::make_zip_iterator(
-        valid_src_first + idx_lower_valid, thrust::make_constant_iterator(dst));
-      thrust::copy(
-        thrust::seq,
-        incoming_edges_to_r_first_valid,
-        incoming_edges_to_r_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
-        incoming_edges_to_r + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
-
-      auto incoming_edges_to_r_first_invalid = thrust::make_zip_iterator(
-        invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(dst));
-      thrust::copy(
-        thrust::seq,
-        incoming_edges_to_r_first_invalid,
-        incoming_edges_to_r_first_invalid + (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
-
-        incoming_edges_to_r + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]);
-
-      if constexpr (is_q_r_edge) {
-        auto potential_closing_edges_first_valid = thrust::make_zip_iterator(
-          valid_src_first + idx_lower_valid, thrust::make_constant_iterator(src));
-        thrust::copy(
-          thrust::seq,
-          potential_closing_edges_first_valid,
-          potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
-          potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
-
-        auto potential_closing_edges_first_invalid = thrust::make_zip_iterator(
-          invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(src));
-        thrust::copy(thrust::seq,
-                     potential_closing_edges_first_invalid,
-                     potential_closing_edges_first_invalid +
-                       (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
-                     potential_closing_edges + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]);
-
-      } else {
-        auto potential_closing_edges_first_valid = thrust::make_zip_iterator(
-          thrust::make_constant_iterator(src), valid_src_first + idx_lower_valid);
-        thrust::copy(
-          thrust::seq,
-          potential_closing_edges_first_valid,
-          potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
-          potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
-
-        auto potential_closing_edges_first_invalid = thrust::make_zip_iterator(
-          thrust::make_constant_iterator(src), invalid_src_first + idx_lower_invalid);
-        thrust::copy(
-          thrust::seq,
-          potential_closing_edges_first_invalid,
-          potential_closing_edges_first_invalid +
-            (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
-          potential_closing_edges + prefix_sum_invalid[idx] + (prefix_sum_valid[idx + 1]));
-      }
-    });
-
-  auto edges_exist = graph_view.has_edge(
-    handle,
-    raft::device_span<vertex_t const>(std::get<0>(potential_closing_edges).data(),
-                                      std::get<0>(potential_closing_edges).size()),
-    raft::device_span<vertex_t const>(std::get<1>(potential_closing_edges).data(),
-                                      std::get<1>(potential_closing_edges).size()));
-
-  auto edge_to_existance = thrust::make_zip_iterator(
-    thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges),
-                              get_dataframe_buffer_begin(incoming_edges_to_r)),
-    edges_exist.begin());
-
-  auto has_edge_last = thrust::remove_if(handle.get_thrust_policy(),
-                                         edge_to_existance,
-                                         edge_to_existance + edges_exist.size(),
-                                         [] __device__(auto e) {
-                                           auto edge_exists = thrust::get<1>(e);
-                                           return edge_exists == 0;
-                                         });
-
-  auto num_edge_exists = thrust::distance(edge_to_existance, has_edge_last);
-
-  // After pushing the non-existant edges to the second partition,
-  // remove them by resizing  both vertex pair buffer
-  resize_dataframe_buffer(potential_closing_edges, num_edge_exists, handle.get_stream());
-  resize_dataframe_buffer(incoming_edges_to_r, num_edge_exists, handle.get_stream());
-
-  auto num_edges_not_overcomp =
-    remove_overcompensating_edges<vertex_t,
-                                  edge_t,
-                                  decltype(get_dataframe_buffer_begin(potential_closing_edges))>(
-      handle,
-      num_edge_exists,
-      get_dataframe_buffer_begin(potential_closing_edges),
-      get_dataframe_buffer_begin(incoming_edges_to_r),
-      raft::device_span<vertex_t const>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
-      raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
-
-  // After pushing the non-existant edges to the second partition,
-  // remove them by resizing  both vertex pair buffer
-  resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream());
-  resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream());
-
-  // Extra check for 'incoming_edges_to_r'
-  if constexpr (!is_q_r_edge) {
-    // Exchange the arguments (incoming_edges_to_r, num_edges_not_overcomp) order
-    // To also check if the 'incoming_edges_to_r' belong the the invalid_edgelist
-    num_edges_not_overcomp =
-      remove_overcompensating_edges<vertex_t,
-                                    edge_t,
-                                    decltype(get_dataframe_buffer_begin(potential_closing_edges))>(
-        handle,
-        num_edges_not_overcomp,
-        get_dataframe_buffer_begin(incoming_edges_to_r),
-        get_dataframe_buffer_begin(potential_closing_edges),
-        raft::device_span<vertex_t const>(edgelist_srcs.data() + num_valid_edges,
-                                          num_invalid_edges),
-        raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges,
-                                          num_invalid_edges));
-
-    resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream());
-    resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream());
-  }
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges),
-                              get_dataframe_buffer_begin(incoming_edges_to_r)),
-    thrust::make_zip_iterator(
-      get_dataframe_buffer_begin(potential_closing_edges) + num_edges_not_overcomp,
-      get_dataframe_buffer_begin(incoming_edges_to_r) + num_edges_not_overcomp),
-    [num_triangles = num_triangles.begin(),
-     num_valid_edges,
-     invalid_first = thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
-                                               edgelist_srcs.begin() + num_valid_edges),
-     invalid_last  = thrust::make_zip_iterator(
-       edgelist_dsts.end(), edgelist_srcs.end())] __device__(auto potential_or_incoming_e) {
-      auto potential_e     = thrust::get<0>(potential_or_incoming_e);
-      auto incoming_e_to_r = thrust::get<1>(potential_or_incoming_e);
-      // thrust::tuple<vertex_t, vertex_t>> transposed_invalid_edge_;
-      auto transposed_invalid_edge =
-        thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<1>(potential_e));
-
-      if constexpr (!is_q_r_edge) {
-        transposed_invalid_edge =
-          thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<0>(potential_e));
-      }
-      auto itr =
-        thrust::lower_bound(thrust::seq, invalid_first, invalid_last, transposed_invalid_edge);
-      if (itr != invalid_last) { assert(*itr == transposed_invalid_edge); }
-      auto dist = thrust::distance(invalid_first, itr) + num_valid_edges;
-
-      cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(num_triangles[dist]);
-      auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed);
-    });
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(num_edges_not_overcomp),
-    unroll_edge<vertex_t,
-                edge_t,
-                decltype(thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()))>{
-      num_valid_edges,
-      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-      get_dataframe_buffer_begin(potential_closing_edges),
-      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()),
-      thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
-                                edgelist_srcs.begin() + num_valid_edges),
-      thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())});
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(num_edges_not_overcomp),
-    unroll_edge<vertex_t,
-                edge_t,
-                decltype(thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()))>{
-      num_valid_edges,
-      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
-      get_dataframe_buffer_begin(incoming_edges_to_r),
-      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()),
-      thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
-                                edgelist_srcs.begin() + num_valid_edges),
-      thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())});
-}
-
 namespace {
 
 template <typename vertex_t>
@@ -434,28 +91,6 @@ struct extract_low_to_high_degree_edges_t {
   }
 };
 
-template <typename vertex_t, typename edge_t, bool generate_p_r>
-struct generate_p_r_or_q_r_from_p_q {
-  size_t chunk_start{};
-  raft::device_span<size_t const> intersection_offsets{};
-  raft::device_span<vertex_t const> intersection_indices{};
-  raft::device_span<vertex_t const> invalid_srcs{};
-  raft::device_span<vertex_t const> invalid_dsts{};
-
-  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
-  {
-    auto itr = thrust::upper_bound(
-      thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
-    auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
-
-    if constexpr (generate_p_r) {
-      return thrust::make_tuple(invalid_srcs[chunk_start + idx], intersection_indices[i]);
-
-    } else {
-      return thrust::make_tuple(invalid_dsts[chunk_start + idx], intersection_indices[i]);
-    }
-  }
-};
 }  // namespace
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
@@ -470,8 +105,6 @@ k_truss(raft::handle_t const& handle,
 {
   // 1. Check input arguments.
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   CUGRAPH_EXPECTS(graph_view.is_symmetric(),
                   "Invalid input arguments: K-truss currently supports undirected graphs only.");
   CUGRAPH_EXPECTS(!graph_view.is_multigraph(),
@@ -497,7 +130,7 @@ k_truss(raft::handle_t const& handle,
                                             exclude_self_loop_t<vertex_t>{});
 
     if constexpr (multi_gpu) {
-      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
+      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) =
         detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                        edge_t,
                                                                                        weight_t,
@@ -520,7 +153,8 @@ k_truss(raft::handle_t const& handle,
     modified_graph_view = (*modified_graph).view();
   }
 
-  // 3. Find (k-1)-core and exclude edges that do not belong to (k-1)-core
+  // 2. Find (k-1)-core and exclude edges that do not belong to (k-1)-core
+
   {
     auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
 
@@ -536,26 +170,23 @@ k_truss(raft::handle_t const& handle,
 
     raft::device_span<edge_t const> core_number_span{core_numbers.data(), core_numbers.size()};
 
-    rmm::device_uvector<vertex_t> srcs{0, handle.get_stream()};
-    rmm::device_uvector<vertex_t> dsts{0, handle.get_stream()};
-    std::tie(srcs, dsts, wgts) = k_core(handle,
-                                        cur_graph_view,
-                                        edge_weight_view,
-                                        k - 1,
-                                        std::make_optional(k_core_degree_type_t::OUT),
-                                        std::make_optional(core_number_span));
+    auto [srcs, dsts, wgts] = k_core(handle,
+                                     cur_graph_view,
+                                     edge_weight_view,
+                                     k - 1,
+                                     std::make_optional(k_core_degree_type_t::OUT),
+                                     std::make_optional(core_number_span));
 
     if constexpr (multi_gpu) {
-      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
+      std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) =
         detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                        edge_t,
                                                                                        weight_t,
                                                                                        int32_t>(
-          handle, std::move(srcs), std::move(dsts), std::nullopt, std::nullopt, std::nullopt);
+          handle, std::move(srcs), std::move(dsts), std::move(wgts), std::nullopt, std::nullopt);
     }
 
     std::optional<rmm::device_uvector<vertex_t>> tmp_renumber_map{std::nullopt};
-
     std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) =
       create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
         handle,
@@ -566,7 +197,7 @@ k_truss(raft::handle_t const& handle,
         std::nullopt,
         std::nullopt,
         cugraph::graph_properties_t{true, graph_view.is_multigraph()},
-        false);
+        true);
 
     modified_graph_view = (*modified_graph).view();
 
@@ -577,10 +208,11 @@ k_truss(raft::handle_t const& handle,
                                                    (*renumber_map).data(),
                                                    *vertex_partition_range_lasts);
     }
+
     renumber_map = std::move(tmp_renumber_map);
   }
 
-  // 4. Keep only the edges from a low-degree vertex to a high-degree vertex.
+  // 3. Keep only the edges from a low-degree vertex to a high-degree vertex.
 
   {
     auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
@@ -625,7 +257,7 @@ k_truss(raft::handle_t const& handle,
     }
 
     if constexpr (multi_gpu) {
-      std::tie(srcs, dsts, wgts, std::ignore, std::ignore) =
+      std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) =
         detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                        edge_t,
                                                                                        weight_t,
@@ -645,7 +277,7 @@ k_truss(raft::handle_t const& handle,
         std::nullopt,
         std::nullopt,
         cugraph::graph_properties_t{false /* now asymmetric */, cur_graph_view.is_multigraph()},
-        false);
+        true);
 
     modified_graph_view = (*modified_graph).view();
     if (renumber_map) {  // collapse renumber_map
@@ -658,264 +290,49 @@ k_truss(raft::handle_t const& handle,
     renumber_map = std::move(tmp_renumber_map);
   }
 
-  // 5. Decompress the resulting graph to an edges list and ind intersection of edges endpoints
-  // for each partition using detail::nbr_intersection
+  // 4. Compute triangle count using nbr_intersection and unroll weak edges
 
   {
     auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
-    rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
-    rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
-    std::optional<rmm::device_uvector<edge_t>> num_triangles{std::nullopt};
-    std::optional<rmm::device_uvector<weight_t>> edgelist_wgts{std::nullopt};
 
     edge_weight_view =
       edge_weight ? std::make_optional((*edge_weight).view())
                   : std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt};
 
-    auto prop_num_triangles = edge_triangle_count<vertex_t, edge_t, false>(handle, cur_graph_view);
-
-    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, num_triangles, std::ignore) =
-      decompress_to_edgelist(
-        handle,
-        cur_graph_view,
-        edge_weight_view,
-        // FIXME: Update 'decompress_edgelist' to support int32_t and int64_t values
-        std::make_optional(prop_num_triangles.view()),
-        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
-        std::optional<raft::device_span<vertex_t const>>(std::nullopt));
-    auto transposed_edge_first =
-      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin());
+    cugraph::edge_property_t<decltype(cur_graph_view), bool> edge_mask(handle, cur_graph_view);
+    cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), bool{true});
 
-    auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
+    while (true) {
+      // FIXME: This approach is very expensive when invalidating only few edges per iteration
+      // and should be address.
+      auto edge_triangle_counts =
+        edge_triangle_count<vertex_t, edge_t, multi_gpu>(handle, cur_graph_view);
 
-    auto transposed_edge_triangle_count_pair_first =
-      thrust::make_zip_iterator(transposed_edge_first, (*num_triangles).begin());
+      // Mask all the edges that have k - 2 count
 
-    thrust::sort_by_key(handle.get_thrust_policy(),
-                        transposed_edge_first,
-                        transposed_edge_first + edgelist_srcs.size(),
-                        (*num_triangles).begin());
+      auto prev_number_of_edges = cur_graph_view.compute_number_of_edges(handle);
 
-    cugraph::edge_property_t<decltype(cur_graph_view), bool> edge_mask(handle, cur_graph_view);
-    cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), true);
-    cur_graph_view.attach_edge_mask(edge_mask.view());
-
-    while (true) {
-      // 'invalid_transposed_edge_triangle_count_first' marks the beginning of the edges to be
-      // removed 'invalid_transposed_edge_triangle_count_first' + edgelist_srcs.size() marks the end
-      // of the edges to be removed 'edge_triangle_count_pair_first' marks the begining of the valid
-      // edges.
-      auto invalid_transposed_edge_triangle_count_first =
-        thrust::stable_partition(handle.get_thrust_policy(),
-                                 transposed_edge_triangle_count_pair_first,
-                                 transposed_edge_triangle_count_pair_first + edgelist_srcs.size(),
-                                 [k] __device__(auto e) {
-                                   auto num_triangles = thrust::get<1>(e);
-                                   return num_triangles >= k - 2;
-                                 });
-      auto num_invalid_edges = static_cast<size_t>(
-        thrust::distance(invalid_transposed_edge_triangle_count_first,
-                         transposed_edge_triangle_count_pair_first + edgelist_srcs.size()));
-
-      if (num_invalid_edges == 0) { break; }
-
-      auto num_valid_edges = edgelist_srcs.size() - num_invalid_edges;
-
-      // case 1. For the (p, q), find intersection 'r'.
-
-      // nbr_intersection requires the edges to be sort by 'src'
-      // sort the invalid edges by src for nbr intersection
-      size_t edges_to_intersect_per_iteration =
-        static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 17);
-
-      size_t prev_chunk_size         = 0;
-      size_t chunk_num_invalid_edges = num_invalid_edges;
-
-      auto num_chunks =
-        raft::div_rounding_up_safe(edgelist_srcs.size(), edges_to_intersect_per_iteration);
-
-      for (size_t i = 0; i < num_chunks; ++i) {
-        auto chunk_size = std::min(edges_to_intersect_per_iteration, chunk_num_invalid_edges);
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            edge_first + num_valid_edges,
-                            edge_first + edgelist_srcs.size(),
-                            (*num_triangles).begin() + num_valid_edges);
-
-        auto [intersection_offsets, intersection_indices] =
-          detail::nbr_intersection(handle,
-                                   cur_graph_view,
-                                   cugraph::edge_dummy_property_t{}.view(),
-                                   edge_first + num_valid_edges + prev_chunk_size,
-                                   edge_first + num_valid_edges + prev_chunk_size + chunk_size,
-                                   std::array<bool, 2>{true, true},
-                                   do_expensive_check);
-
-        // Update the number of triangles of each (p, q) edges by looking at their intersection
-        // size.
-        thrust::for_each(
-          handle.get_thrust_policy(),
-          thrust::make_counting_iterator<edge_t>(0),
-          thrust::make_counting_iterator<edge_t>(chunk_size),
-          [chunk_start   = prev_chunk_size,
-           num_triangles = raft::device_span<edge_t>((*num_triangles).data() + num_valid_edges,
-                                                     num_invalid_edges),
-           intersection_offsets = raft::device_span<size_t const>(
-             intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) {
-            num_triangles[chunk_start + i] -=
-              (intersection_offsets[i + 1] - intersection_offsets[i]);
-          });
-
-        // FIXME: Find a way to not have to maintain a dataframe_buffer
-        auto vertex_pair_buffer_p_r_edge_p_q =
-          allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
-                                                                       handle.get_stream());
-        thrust::tabulate(
-          handle.get_thrust_policy(),
-          get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
-          get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q),
-          generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, true>{
-            prev_chunk_size,
-            raft::device_span<size_t const>(intersection_offsets.data(),
-                                            intersection_offsets.size()),
-            raft::device_span<vertex_t const>(intersection_indices.data(),
-                                              intersection_indices.size()),
-            raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
-            raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges,
-                                        num_invalid_edges)});
-
-        auto vertex_pair_buffer_q_r_edge_p_q =
-          allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
-                                                                       handle.get_stream());
-        thrust::tabulate(
-          handle.get_thrust_policy(),
-          get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
-          get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q),
-          generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, false>{
-            prev_chunk_size,
-            raft::device_span<size_t const>(intersection_offsets.data(),
-                                            intersection_offsets.size()),
-            raft::device_span<vertex_t const>(intersection_indices.data(),
-                                              intersection_indices.size()),
-            raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
-            raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges,
-                                        num_invalid_edges)});
-
-        // Unrolling the edges require the edges to be sorted by destination
-        // re-sort the invalid edges by 'dst'
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            transposed_edge_first + num_valid_edges,
-                            transposed_edge_first + edgelist_srcs.size(),
-                            (*num_triangles).begin() + num_valid_edges);
-
-        thrust::for_each(
-          handle.get_thrust_policy(),
-          thrust::make_counting_iterator<edge_t>(0),
-          thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-          unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
-            num_valid_edges,
-            raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()),
-            get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
-            transposed_edge_first,
-            transposed_edge_first + num_valid_edges,
-            transposed_edge_first + edgelist_srcs.size()});
-
-        thrust::for_each(
-          handle.get_thrust_policy(),
-          thrust::make_counting_iterator<edge_t>(0),
-          thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
-          unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
-            num_valid_edges,
-            raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()),
-            get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
-            transposed_edge_first,
-            transposed_edge_first + num_valid_edges,
-            transposed_edge_first + edgelist_srcs.size()});
-
-        prev_chunk_size += chunk_size;
-        chunk_num_invalid_edges -= chunk_size;
-      }
-      // case 2: unroll (q, r)
-      // For each (q, r) edges to unroll, find the incoming edges to 'r' let's say from 'p' and
-      // create the pair (p, q)
-      cugraph::unroll_p_r_or_q_r_edges<vertex_t, edge_t, false, true>(
-        handle,
-        cur_graph_view,
-        num_invalid_edges,
-        num_valid_edges,
-        raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
-        raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
-        raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()));
-
-      // case 3: unroll (p, r)
-      cugraph::unroll_p_r_or_q_r_edges<vertex_t, edge_t, false, false>(
+      cugraph::transform_e(
         handle,
         cur_graph_view,
-        num_invalid_edges,
-        num_valid_edges,
-        raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
-        raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
-        raft::device_span<edge_t>((*num_triangles).data(), (*num_triangles).size()));
-
-      // Remove edges that have a triangle count of zero. Those should not be accounted
-      // for during the unroling phase.
-      auto edges_with_triangle_last = thrust::stable_partition(
-        handle.get_thrust_policy(),
-        transposed_edge_triangle_count_pair_first,
-        transposed_edge_triangle_count_pair_first + (*num_triangles).size(),
-        [] __device__(auto e) {
-          auto num_triangles = thrust::get<1>(e);
-          return num_triangles > 0;
-        });
-
-      auto num_edges_with_triangles = static_cast<size_t>(
-        thrust::distance(transposed_edge_triangle_count_pair_first, edges_with_triangle_last));
-
-      thrust::sort(handle.get_thrust_policy(),
-                   thrust::make_zip_iterator(edgelist_srcs.begin() + num_edges_with_triangles,
-                                             edgelist_dsts.begin() + num_edges_with_triangles),
-                   thrust::make_zip_iterator(edgelist_srcs.end(), edgelist_dsts.end()));
-
-      cugraph::edge_bucket_t<vertex_t, void, true, multi_gpu, true> edges_with_no_triangle(handle);
-      edges_with_no_triangle.insert(edgelist_srcs.begin() + num_edges_with_triangles,
-                                    edgelist_srcs.end(),
-                                    edgelist_dsts.begin() + num_edges_with_triangles);
-
-      cur_graph_view.clear_edge_mask();
-      if (edge_weight_view) {
-        cugraph::transform_e(
-          handle,
-          cur_graph_view,
-          edges_with_no_triangle,
-          cugraph::edge_src_dummy_property_t{}.view(),
-          cugraph::edge_dst_dummy_property_t{}.view(),
-          *edge_weight_view,
-          [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wgt) {
-            return false;
-          },
-          edge_mask.mutable_view(),
-          false);
-      } else {
-        cugraph::transform_e(
-          handle,
-          cur_graph_view,
-          edges_with_no_triangle,
-          cugraph::edge_src_dummy_property_t{}.view(),
-          cugraph::edge_dst_dummy_property_t{}.view(),
-          cugraph::edge_dummy_property_t{}.view(),
-          [] __device__(
-            auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
-            return false;
-          },
-          edge_mask.mutable_view(),
-          false);
-      }
+        cugraph::edge_src_dummy_property_t{}.view(),
+        cugraph::edge_dst_dummy_property_t{}.view(),
+        edge_triangle_counts.view(),
+        [k] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto count) {
+          return count >= k - 2;
+        },
+        edge_mask.mutable_view(),
+        false);
+
       cur_graph_view.attach_edge_mask(edge_mask.view());
 
-      edgelist_srcs.resize(num_edges_with_triangles, handle.get_stream());
-      edgelist_dsts.resize(num_edges_with_triangles, handle.get_stream());
-      (*num_triangles).resize(num_edges_with_triangles, handle.get_stream());
+      if (prev_number_of_edges == cur_graph_view.compute_number_of_edges(handle)) { break; }
     }
 
+    rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
+    std::optional<rmm::device_uvector<weight_t>> edgelist_wgts{std::nullopt};
+
     std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore, std::ignore) =
       decompress_to_edgelist(
         handle,
@@ -923,7 +340,8 @@ k_truss(raft::handle_t const& handle,
         edge_weight_view ? std::make_optional(*edge_weight_view) : std::nullopt,
         std::optional<edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
         std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
-        std::optional<raft::device_span<vertex_t const>>(std::nullopt));
+        std::make_optional(
+          raft::device_span<vertex_t const>((*renumber_map).data(), (*renumber_map).size())));
 
     std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts) =
       symmetrize_edgelist<vertex_t, weight_t, false, multi_gpu>(handle,
diff --git a/cpp/src/community/k_truss_mg_v32_e32.cu b/cpp/src/community/k_truss_mg_v32_e32.cu
new file mode 100644
index 00000000000..4feb69f6098
--- /dev/null
+++ b/cpp/src/community/k_truss_mg_v32_e32.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "community/k_truss_impl.cuh"
+
+namespace cugraph {
+
+// MG instantiation
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+        int32_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+        int32_t k,
+        bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/k_truss_mg_v32_e64.cu b/cpp/src/community/k_truss_mg_v32_e64.cu
new file mode 100644
index 00000000000..b07f9382612
--- /dev/null
+++ b/cpp/src/community/k_truss_mg_v32_e64.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "community/k_truss_impl.cuh"
+
+namespace cugraph {
+
+// MG instantiation
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/k_truss_mg_v64_e64.cu b/cpp/src/community/k_truss_mg_v64_e64.cu
new file mode 100644
index 00000000000..1c730fe272d
--- /dev/null
+++ b/cpp/src/community/k_truss_mg_v64_e64.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "community/k_truss_impl.cuh"
+
+namespace cugraph {
+
+// MG instantiation
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh
index 0b453cfe262..e902901cd36 100644
--- a/cpp/src/community/triangle_count_impl.cuh
+++ b/cpp/src/community/triangle_count_impl.cuh
@@ -439,7 +439,7 @@ void triangle_count(raft::handle_t const& handle,
                                             extract_low_to_high_degree_edges_t<vertex_t, edge_t>{});
 
     if constexpr (multi_gpu) {
-      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
+      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) =
         detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                        edge_t,
                                                                                        weight_t,
diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh
index 681523dad90..468f4f7280f 100644
--- a/cpp/src/components/weakly_connected_components_impl.cuh
+++ b/cpp/src/components/weakly_connected_components_impl.cuh
@@ -682,6 +682,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
                  std::get<1>(edge_buffer),
                  std::ignore,
                  std::ignore,
+                 std::ignore,
                  std::ignore) =
           detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
             vertex_t,
diff --git a/cpp/src/cores/k_core_impl.cuh b/cpp/src/cores/k_core_impl.cuh
index 06402cc3382..2c5bf987a47 100644
--- a/cpp/src/cores/k_core_impl.cuh
+++ b/cpp/src/cores/k_core_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,8 +37,6 @@ k_core(raft::handle_t const& handle,
        std::optional<raft::device_span<edge_t const>> core_numbers,
        bool do_expensive_check)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   rmm::device_uvector<edge_t> computed_core_numbers(0, handle.get_stream());
 
   if (!core_numbers) {
diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh
index 487f31e5e03..b39895129dc 100644
--- a/cpp/src/link_prediction/similarity_impl.cuh
+++ b/cpp/src/link_prediction/similarity_impl.cuh
@@ -408,7 +408,7 @@ all_pairs_similarity(raft::handle_t const& handle,
           // shuffle vertex pairs
           auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
 
-          std::tie(v1, v2, std::ignore, std::ignore, std::ignore) =
+          std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) =
             detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                            edge_t,
                                                                                            weight_t,
@@ -599,7 +599,7 @@ all_pairs_similarity(raft::handle_t const& handle,
       // shuffle vertex pairs
       auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
 
-      std::tie(v1, v2, std::ignore, std::ignore, std::ignore) =
+      std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) =
         detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                        edge_t,
                                                                                        weight_t,
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index fb1dee1a92f..ed0a70e570f 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -349,7 +349,8 @@ coarsen_graph(raft::handle_t const& handle,
 
     // 1-2. globally shuffle
 
-    std::tie(edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore) =
+    std::tie(
+      edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore, std::ignore) =
       cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
         edge_t,
@@ -475,6 +476,7 @@ coarsen_graph(raft::handle_t const& handle,
              reversed_edgelist_minors,
              reversed_edgelist_weights,
              std::ignore,
+             std::ignore,
              std::ignore) =
       cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
diff --git a/cpp/src/structure/symmetrize_edgelist_impl.cuh b/cpp/src/structure/symmetrize_edgelist_impl.cuh
index a6a4c0947c7..1fd566938eb 100644
--- a/cpp/src/structure/symmetrize_edgelist_impl.cuh
+++ b/cpp/src/structure/symmetrize_edgelist_impl.cuh
@@ -295,6 +295,7 @@ symmetrize_edgelist(raft::handle_t const& handle,
              upper_triangular_majors,
              upper_triangular_weights,
              std::ignore,
+             std::ignore,
              std::ignore) =
       detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                      vertex_t,
@@ -481,6 +482,7 @@ symmetrize_edgelist(raft::handle_t const& handle,
              upper_triangular_minors,
              upper_triangular_weights,
              std::ignore,
+             std::ignore,
              std::ignore) =
       detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                      vertex_t,
diff --git a/cpp/src/structure/transpose_graph_impl.cuh b/cpp/src/structure/transpose_graph_impl.cuh
index a1eef536c43..6786b52faed 100644
--- a/cpp/src/structure/transpose_graph_impl.cuh
+++ b/cpp/src/structure/transpose_graph_impl.cuh
@@ -96,6 +96,7 @@ transpose_graph_impl(
            store_transposed ? edgelist_dsts : edgelist_srcs,
            edgelist_weights,
            std::ignore,
+           std::ignore,
            std::ignore) =
     detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                    edge_t,
diff --git a/cpp/src/structure/transpose_graph_storage_impl.cuh b/cpp/src/structure/transpose_graph_storage_impl.cuh
index f8e479f28ff..374bfd05c40 100644
--- a/cpp/src/structure/transpose_graph_storage_impl.cuh
+++ b/cpp/src/structure/transpose_graph_storage_impl.cuh
@@ -96,6 +96,7 @@ transpose_graph_storage_impl(
            !store_transposed ? edgelist_srcs : edgelist_dsts,
            edgelist_weights,
            std::ignore,
+           std::ignore,
            std::ignore) =
     detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
                                                                                    edge_t,
diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh
index 8cfefa1a060..70327db5ffb 100644
--- a/cpp/src/utilities/shuffle_vertex_pairs.cuh
+++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh
@@ -42,7 +42,8 @@ std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_t>>>
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::vector<size_t>>
 shuffle_vertex_pairs_with_values_by_gpu_id_impl(
   raft::handle_t const& handle,
   rmm::device_uvector<vertex_t>&& majors,
@@ -178,25 +179,27 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
                     handle.get_stream());
   handle.sync_stream();
 
+  std::vector<size_t> rx_counts{};
+
   if (mem_frugal_flag) {  // trade-off potential parallelism to lower peak memory
-    std::tie(majors, std::ignore) =
+    std::tie(majors, rx_counts) =
       shuffle_values(comm, majors.begin(), h_tx_value_counts, handle.get_stream());
 
-    std::tie(minors, std::ignore) =
+    std::tie(minors, rx_counts) =
       shuffle_values(comm, minors.begin(), h_tx_value_counts, handle.get_stream());
 
     if (weights) {
-      std::tie(weights, std::ignore) =
+      std::tie(weights, rx_counts) =
         shuffle_values(comm, (*weights).begin(), h_tx_value_counts, handle.get_stream());
     }
 
     if (edge_ids) {
-      std::tie(edge_ids, std::ignore) =
+      std::tie(edge_ids, rx_counts) =
         shuffle_values(comm, (*edge_ids).begin(), h_tx_value_counts, handle.get_stream());
     }
 
     if (edge_types) {
-      std::tie(edge_types, std::ignore) =
+      std::tie(edge_types, rx_counts) =
         shuffle_values(comm, (*edge_types).begin(), h_tx_value_counts, handle.get_stream());
     }
   } else {
@@ -204,7 +207,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
       if (edge_ids) {
         if (edge_types) {
           std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, edge_types),
-                                std::ignore) =
+                                rx_counts) =
             shuffle_values(comm,
                            thrust::make_zip_iterator(majors.begin(),
                                                      minors.begin(),
@@ -214,7 +217,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
                            h_tx_value_counts,
                            handle.get_stream());
         } else {
-          std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), std::ignore) =
+          std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), rx_counts) =
             shuffle_values(comm,
                            thrust::make_zip_iterator(
                              majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()),
@@ -223,14 +226,14 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
         }
       } else {
         if (edge_types) {
-          std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), std::ignore) =
+          std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), rx_counts) =
             shuffle_values(comm,
                            thrust::make_zip_iterator(
                              majors.begin(), minors.begin(), weights->begin(), edge_types->begin()),
                            h_tx_value_counts,
                            handle.get_stream());
         } else {
-          std::forward_as_tuple(std::tie(majors, minors, weights), std::ignore) = shuffle_values(
+          std::forward_as_tuple(std::tie(majors, minors, weights), rx_counts) = shuffle_values(
             comm,
             thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()),
             h_tx_value_counts,
@@ -240,7 +243,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
     } else {
       if (edge_ids) {
         if (edge_types) {
-          std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types), std::ignore) =
+          std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types), rx_counts) =
             shuffle_values(
               comm,
               thrust::make_zip_iterator(
@@ -248,7 +251,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
               h_tx_value_counts,
               handle.get_stream());
         } else {
-          std::forward_as_tuple(std::tie(majors, minors, edge_ids), std::ignore) = shuffle_values(
+          std::forward_as_tuple(std::tie(majors, minors, edge_ids), rx_counts) = shuffle_values(
             comm,
             thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()),
             h_tx_value_counts,
@@ -256,13 +259,13 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
         }
       } else {
         if (edge_types) {
-          std::forward_as_tuple(std::tie(majors, minors, edge_types), std::ignore) = shuffle_values(
+          std::forward_as_tuple(std::tie(majors, minors, edge_types), rx_counts) = shuffle_values(
             comm,
             thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()),
             h_tx_value_counts,
             handle.get_stream());
         } else {
-          std::forward_as_tuple(std::tie(majors, minors), std::ignore) =
+          std::forward_as_tuple(std::tie(majors, minors), rx_counts) =
             shuffle_values(comm,
                            thrust::make_zip_iterator(majors.begin(), minors.begin()),
                            h_tx_value_counts,
@@ -276,7 +279,8 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl(
                          std::move(minors),
                          std::move(weights),
                          std::move(edge_ids),
-                         std::move(edge_types));
+                         std::move(edge_types),
+                         std::move(rx_counts));
 }
 
 }  // namespace
@@ -288,7 +292,8 @@ std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_t>>>
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<vertex_t>&& majors,
@@ -320,7 +325,8 @@ std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_t>>>
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<vertex_t>&& majors,
@@ -366,7 +372,8 @@ std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<weight_t>>,
            std::optional<rmm::device_uvector<edge_t>>,
-           std::optional<rmm::device_uvector<edge_type_t>>>
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<vertex_t>&& edge_srcs,
                        rmm::device_uvector<vertex_t>&& edge_dsts,
diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu
index 7943ebcd5f4..db5a7c0e9bd 100644
--- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu
+++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu
@@ -36,7 +36,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -49,7 +50,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -62,7 +64,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -76,7 +79,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -92,7 +96,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int32_t>&& majors,
                        rmm::device_uvector<int32_t>&& minors,
@@ -104,7 +109,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int32_t>&& majors,
                        rmm::device_uvector<int32_t>&& minors,
diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu
index 230fce435f9..d79b2379224 100644
--- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu
+++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu
@@ -35,7 +35,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -48,7 +49,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -61,7 +63,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -75,7 +78,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int32_t>&& majors,
@@ -91,7 +95,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int32_t>&& majors,
                        rmm::device_uvector<int32_t>&& minors,
@@ -103,7 +108,8 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int32_t>&& majors,
                        rmm::device_uvector<int32_t>&& minors,
diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu
index 8134c41696b..605976f7076 100644
--- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu
+++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu
@@ -36,7 +36,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int64_t>&& majors,
@@ -49,7 +50,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int64_t>&& majors,
@@ -62,7 +64,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int64_t>&& majors,
@@ -76,7 +79,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   raft::handle_t const& handle,
   rmm::device_uvector<int64_t>&& majors,
@@ -92,7 +96,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<float>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int64_t>&& majors,
                        rmm::device_uvector<int64_t>&& minors,
@@ -104,7 +109,8 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
                     std::optional<rmm::device_uvector<double>>,
                     std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>>
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::vector<size_t>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<int64_t>&& majors,
                        rmm::device_uvector<int64_t>&& minors,
diff --git a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu
index 0c91eb546d6..db7be5a3031 100644
--- a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu
+++ b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu
@@ -40,6 +40,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<int32_t>&& d_values,
   std::vector<int32_t> const& vertex_partition_range_lasts);
 
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int64_t>>
+shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int32_t>&& d_vertices,
+  rmm::device_uvector<int64_t>&& d_values,
+  std::vector<int32_t> const& vertex_partition_range_lasts);
+
 template rmm::device_uvector<int32_t> shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
   raft::handle_t const& handle, rmm::device_uvector<int32_t>&& d_vertices);
 
diff --git a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu
index 5abce7c0783..7d968006bc7 100644
--- a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu
+++ b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu
@@ -35,6 +35,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<int32_t>&& d_values,
   std::vector<int64_t> const& vertex_partition_range_lasts);
 
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int64_t>&& d_vertices,
+  rmm::device_uvector<int64_t>&& d_values,
+  std::vector<int64_t> const& vertex_partition_range_lasts);
+
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
 shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   raft::handle_t const& handle,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 892ba91af86..52d257b9bea 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -500,7 +500,7 @@ ConfigureTest(CORE_NUMBER_TEST cores/core_number_test.cpp)
 ConfigureTest(K_CORE_TEST cores/k_core_test.cpp)
 
 ###################################################################################################
-# - K-truss tests ---------------------------------------------------------------------------------
+# - K-truss tests --------------------------------------------------------------------------
 ConfigureTest(K_TRUSS_TEST community/k_truss_test.cpp)
 
 ###################################################################################################
@@ -619,6 +619,10 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG EDGE TRIANGLE COUNT tests --------------------------------------------------------------
     ConfigureTestMG(MG_EDGE_TRIANGLE_COUNT_TEST community/mg_edge_triangle_count_test.cpp)
 
+    ###############################################################################################
+    # - MG K-TRUSS tests --------------------------------------------------------------------------
+    ConfigureTestMG(MG_K_TRUSS_TEST community/mg_k_truss_test.cpp)
+
     ###############################################################################################
     # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------
     ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST
@@ -783,6 +787,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureCTestMG(MG_CAPI_COUNT_MULTI_EDGES c_api/mg_count_multi_edges_test.c)
     ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c)
     ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST c_api/mg_two_hop_neighbors_test.c)
+    ConfigureCTestMG(MG_CAPI_K_TRUSS c_api/mg_k_truss_test.c)
 
     rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing_mg DESTINATION bin/gtests/libcugraph_mg)
 
diff --git a/cpp/tests/c_api/mg_k_truss_test.c b/cpp/tests/c_api/mg_k_truss_test.c
new file mode 100644
index 00000000000..e406eb330a7
--- /dev/null
+++ b/cpp/tests/c_api/mg_k_truss_test.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mg_test_utils.h" /* RUN_TEST */
+
+#include <cugraph_c/graph.h>
+#include <cugraph_c/graph_functions.h>
+
+#include <stdio.h>
+
+typedef int32_t vertex_t;
+typedef int32_t edge_t;
+typedef float weight_t;
+
+/*
+ * Simple check of creating a graph from a COO on device memory.
+ */
+int generic_k_truss_test(const cugraph_resource_handle_t* handle,
+                         vertex_t* h_src,
+                         vertex_t* h_dst,
+                         weight_t* h_wgt,
+                         size_t num_edges,
+                         size_t num_results,
+                         size_t k,
+                         bool_t store_transposed,
+                         vertex_t* h_result_src,
+                         vertex_t* h_result_dst,
+                         weight_t* h_result_wgt)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error;
+
+  cugraph_graph_t* graph = NULL;
+
+  cugraph_induced_subgraph_result_t* result = NULL;
+
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t size_t_tid = SIZE_T;
+
+  ret_code = create_mg_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, TRUE, &graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  ret_code = cugraph_k_truss_subgraph(handle, graph, k, FALSE, &result, &ret_error);
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_k_truss failed.");
+
+  cugraph_type_erased_device_array_view_t* k_truss_src;
+  cugraph_type_erased_device_array_view_t* k_truss_dst;
+  cugraph_type_erased_device_array_view_t* k_truss_wgt;
+
+  k_truss_src = cugraph_induced_subgraph_get_sources(result);
+  k_truss_dst = cugraph_induced_subgraph_get_destinations(result);
+  k_truss_wgt = cugraph_induced_subgraph_get_edge_weights(result);
+
+  size_t k_truss_size = cugraph_type_erased_device_array_view_size(k_truss_src);
+
+  vertex_t h_k_truss_src[k_truss_size];
+  vertex_t h_k_truss_dst[k_truss_size];
+  weight_t h_k_truss_wgt[k_truss_size];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_k_truss_src, k_truss_src, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_k_truss_dst, k_truss_dst, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_k_truss_wgt, k_truss_wgt, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  for (size_t i = 0; (i < k_truss_size) && (test_ret_value == 0); ++i) {
+    bool_t found = FALSE;
+    for (size_t j = 0; (j < num_results) && !found; ++j) {
+      if ((h_k_truss_src[i] == h_result_src[j]) && (h_k_truss_dst[i] == h_result_dst[j]) &&
+          (h_k_truss_wgt[i] == h_result_wgt[j]))
+        found = TRUE;
+    }
+    TEST_ASSERT(test_ret_value, found, "k_truss subgraph has an edge that doesn't match");
+  }
+
+  cugraph_induced_subgraph_result_free(result);
+  cugraph_mg_graph_free(graph);
+  cugraph_error_free(ret_error);
+  return test_ret_value;
+}
+
+int test_k_truss_subgraph(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges    = 14;
+  size_t num_vertices = 7;
+  size_t num_results  = 6;
+  size_t k            = 3;
+
+  vertex_t h_src[] = {0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 5, 6};
+  vertex_t h_dst[] = {1, 2, 5, 0, 2, 3, 4, 6, 0, 1, 1, 1, 0, 1};
+  weight_t h_wgt[] = {
+    1.2f, 1.3f, 1.6f, 1.2f, 2.3f, 2.4f, 2.5f, 2.7f, 1.3f, 2.3f, 2.4f, 2.5f, 1.6f, 2.7f};
+
+  vertex_t h_result_src[] = {0, 2, 2, 1, 1, 0};
+  vertex_t h_result_dst[] = {1, 1, 0, 0, 2, 2};
+  weight_t h_result_wgt[] = {1.2f, 2.3f, 1.3f, 1.2f, 2.3f, 1.3f};
+
+  return generic_k_truss_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_edges,
+                              num_results,
+                              k,
+                              FALSE,
+                              h_result_src,
+                              h_result_dst,
+                              h_result_wgt);
+}
+
+/******************************************************************************/
+
+int main(int argc, char** argv)
+{
+  void* raft_handle                 = create_mg_raft_handle(argc, argv);
+  cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle);
+
+  int result = 0;
+  result |= RUN_MG_TEST(test_k_truss_subgraph, handle);
+
+  cugraph_free_resource_handle(handle);
+  free_mg_raft_handle(raft_handle);
+
+  return result;
+}
diff --git a/cpp/tests/community/mg_k_truss_test.cpp b/cpp/tests/community/mg_k_truss_test.cpp
new file mode 100644
index 00000000000..a1624949007
--- /dev/null
+++ b/cpp/tests/community/mg_k_truss_test.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+struct KTruss_Usecase {
+  int32_t k_{3};
+  bool test_weighted_{false};
+  bool edge_masking_{false};
+  bool check_correctness_{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGKTruss
+  : public ::testing::TestWithParam<std::tuple<KTruss_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGKTruss() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running KTruss on multiple GPUs to that of a single-GPU run
+  template <typename vertex_t, typename edge_t>
+  void run_current_test(KTruss_Usecase const& k_truss_usecase, input_usecase_t const& input_usecase)
+  {
+    using weight_t = float;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    auto [mg_graph, edge_weight, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, k_truss_usecase.test_weighted_, true, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (k_truss_usecase.edge_masking_) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG KTruss
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG KTruss");
+    }
+
+    auto mg_edge_weight_view =
+      edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt;
+    auto [d_cugraph_srcs, d_cugraph_dsts, d_cugraph_wgts] =
+      cugraph::k_truss<vertex_t, edge_t, weight_t, true>(
+        *handle_, mg_graph_view, mg_edge_weight_view, k_truss_usecase.k_, false);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. Compare SG & MG results
+
+    if (k_truss_usecase.check_correctness_) {
+      cugraph::unrenumber_int_vertices<vertex_t, true>(
+        *handle_,
+        d_cugraph_srcs.data(),
+        d_cugraph_srcs.size(),
+        (*mg_renumber_map).data(),
+        mg_graph_view.vertex_partition_range_lasts());
+
+      cugraph::unrenumber_int_vertices<vertex_t, true>(
+        *handle_,
+        d_cugraph_dsts.data(),
+        d_cugraph_dsts.size(),
+        (*mg_renumber_map).data(),
+        mg_graph_view.vertex_partition_range_lasts());
+
+      auto global_d_cugraph_srcs = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>(d_cugraph_srcs.data(), d_cugraph_srcs.size()));
+
+      auto global_d_cugraph_dsts = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>(d_cugraph_dsts.data(), d_cugraph_srcs.size()));
+
+      rmm::device_uvector<vertex_t> d_sorted_cugraph_srcs{0, handle_->get_stream()};
+      rmm::device_uvector<vertex_t> d_sorted_cugraph_dsts{0, handle_->get_stream()};
+      rmm::device_uvector<weight_t> d_sorted_cugraph_wgts{0, handle_->get_stream()};
+
+      if (edge_weight) {
+        auto global_d_cugraph_wgts = cugraph::test::device_gatherv(
+          *handle_,
+          raft::device_span<weight_t const>((*d_cugraph_wgts).data(), (*d_cugraph_wgts).size()));
+
+        std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts, d_sorted_cugraph_wgts) =
+          cugraph::test::sort_by_key<vertex_t, weight_t>(
+            *handle_, global_d_cugraph_srcs, global_d_cugraph_dsts, global_d_cugraph_wgts);
+
+      } else {
+        std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts) =
+          cugraph::test::sort<vertex_t>(*handle_, global_d_cugraph_srcs, global_d_cugraph_dsts);
+      }
+
+      // 3-1. Convert to SG graph
+      auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] =
+        cugraph::test::mg_graph_to_sg_graph(
+          *handle_,
+          mg_graph_view,
+          mg_edge_weight_view,
+          std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          false);
+
+      auto sg_edge_weight_view =
+        sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == int{0}) {
+        auto sg_graph_view = sg_graph.view();
+
+        // 3-2. Run SG KTruss
+        auto [ref_d_cugraph_srcs, ref_d_cugraph_dsts, ref_d_cugraph_wgts] =
+          cugraph::k_truss<vertex_t, edge_t, weight_t, false>(
+            *handle_, sg_graph_view, sg_edge_weight_view, k_truss_usecase.k_, false);
+
+        rmm::device_uvector<vertex_t> d_sorted_ref_cugraph_srcs{0, handle_->get_stream()};
+        rmm::device_uvector<vertex_t> d_sorted_ref_cugraph_dsts{0, handle_->get_stream()};
+        rmm::device_uvector<weight_t> d_sorted_ref_cugraph_wgts{0, handle_->get_stream()};
+
+        if (edge_weight) {
+          std::tie(
+            d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts, d_sorted_ref_cugraph_wgts) =
+            cugraph::test::sort_by_key<vertex_t, weight_t>(
+              *handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts, *ref_d_cugraph_wgts);
+
+        } else {
+          std::tie(d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts) =
+            cugraph::test::sort<vertex_t>(*handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts);
+        }
+
+        // 3-3. Compare
+        auto h_cugraph_srcs     = cugraph::test::to_host(*handle_, d_sorted_cugraph_srcs);
+        auto h_cugraph_dsts     = cugraph::test::to_host(*handle_, d_sorted_cugraph_dsts);
+        auto ref_h_cugraph_srcs = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_srcs);
+        auto ref_h_cugraph_dsts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_dsts);
+
+        ASSERT_TRUE(
+          std::equal(h_cugraph_srcs.begin(), h_cugraph_srcs.end(), ref_h_cugraph_srcs.begin()));
+
+        ASSERT_TRUE(
+          std::equal(h_cugraph_dsts.begin(), h_cugraph_dsts.end(), ref_h_cugraph_dsts.begin()));
+
+        if (edge_weight) {
+          auto ref_h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_wgts);
+
+          auto h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_cugraph_wgts);
+
+          ASSERT_TRUE(
+            std::equal(h_cugraph_wgts.begin(), h_cugraph_wgts.end(), ref_h_cugraph_wgts.begin()));
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGKTruss<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGKTruss_File = Tests_MGKTruss<cugraph::test::File_Usecase>;
+using Tests_MGKTruss_Rmat = Tests_MGKTruss<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGKTruss_File, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int32)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int64)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGKTruss_Rmat, CheckInt64Int64)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t>(
+    std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_tests,
+  Tests_MGKTruss_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(KTruss_Usecase{4, false, true, true}, KTruss_Usecase{5, true, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_tests,
+  Tests_MGKTruss_Rmat,
+  ::testing::Combine(
+    ::testing::Values(KTruss_Usecase{4, false, false, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGKTruss_Rmat,
+  ::testing::Combine(
+    ::testing::Values(KTruss_Usecase{4, false, false, false},
+                      KTruss_Usecase{5, false, false, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_prediction/mg_similarity_test.cpp b/cpp/tests/link_prediction/mg_similarity_test.cpp
index 8f674e6a6de..3bcabb6b6df 100644
--- a/cpp/tests/link_prediction/mg_similarity_test.cpp
+++ b/cpp/tests/link_prediction/mg_similarity_test.cpp
@@ -106,7 +106,7 @@ class Tests_MGSimilarity
     auto d_v1 = cugraph::test::to_device(*handle_, h_v1);
     auto d_v2 = std::move(two_hop_nbrs);
 
-    std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) =
+    std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) =
       cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
         edge_t,
diff --git a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp
index 192caa5227e..730a3ac8f08 100644
--- a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp
+++ b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp
@@ -108,7 +108,7 @@ class Tests_MGSimilarity
     auto d_v1 = cugraph::test::to_device(*handle_, h_v1);
     auto d_v2 = std::move(two_hop_nbrs);
 
-    std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) =
+    std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) =
       cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
         edge_t,
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
index 681a7d8e6ff..fc6369ec721 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
@@ -149,6 +149,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection
              std::get<1>(mg_vertex_pair_buffer),
              std::ignore,
              std::ignore,
+             std::ignore,
              std::ignore) =
       cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
index 48bbc6176d8..06a23880d81 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
@@ -175,6 +175,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection
              std::get<1>(mg_vertex_pair_buffer),
              std::ignore,
              std::ignore,
+             std::ignore,
              std::ignore) =
       cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
index 3d3d881fb23..b8ad06dd18b 100644
--- a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
+++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp
@@ -123,6 +123,7 @@ class Tests_MGHasEdgeAndComputeMultiplicity
              store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts,
              std::ignore,
              std::ignore,
+             std::ignore,
              std::ignore) =
       cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
         vertex_t,
diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index b6898fbaf78..0a706d1cf80 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -335,6 +335,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
                  store_transposed ? tmp_src_v : tmp_dst_v,
                  tmp_weights_v,
                  std::ignore,
+                 std::ignore,
                  std::ignore) =
           cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
             vertex_t,