From f753e5146bac7243d962736a9377a843ca81fd6f Mon Sep 17 00:00:00 2001
From: Joseph Nke <76006812+jnke2016@users.noreply.github.com>
Date: Tue, 26 Mar 2024 23:28:45 +0100
Subject: [PATCH] Ktruss implementation (#4059)

Implements SG and MG ktruss using graph primitives and drop `cuHornet`.

Closes #3447
Closes #3448
Closes #3449
Closes #3450
Closes #3451
Closes #3452
Closes #3453

Authors:
  - Joseph Nke (https://github.com/jnke2016)
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4059
---
 cpp/CMakeLists.txt                            |  32 +-
 cpp/cmake/thirdparty/get_cuhornet.cmake       |  45 -
 cpp/include/cugraph/algorithms.hpp            |  73 +-
 .../cugraph/utilities/graph_traits.hpp        |   6 +-
 .../c_api/{legacy_k_truss.cpp => k_truss.cpp} |  34 +-
 .../community/edge_triangle_count_impl.cuh    | 154 +++
 cpp/src/community/edge_triangle_count_sg.cu   |  39 +
 cpp/src/community/k_truss_impl.cuh            | 915 ++++++++++++++++++
 cpp/src/community/k_truss_sg.cu               |  77 ++
 cpp/src/community/legacy/ktruss.cu            | 185 ----
 cpp/tests/CMakeLists.txt                      |   6 +-
 .../{legacy_k_truss_test.c => k_truss_test.c} |  13 +-
 cpp/tests/community/k_truss_test.cpp          | 325 +++++++
 cpp/tests/utilities/thrust_wrapper.cu         | 111 +++
 cpp/tests/utilities/thrust_wrapper.hpp        |  12 +
 15 files changed, 1693 insertions(+), 334 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/get_cuhornet.cmake
 rename cpp/src/c_api/{legacy_k_truss.cpp => k_truss.cpp} (80%)
 create mode 100644 cpp/src/community/edge_triangle_count_impl.cuh
 create mode 100644 cpp/src/community/edge_triangle_count_sg.cu
 create mode 100644 cpp/src/community/k_truss_impl.cuh
 create mode 100644 cpp/src/community/k_truss_sg.cu
 delete mode 100644 cpp/src/community/legacy/ktruss.cu
 rename cpp/tests/c_api/{legacy_k_truss_test.c => k_truss_test.c} (95%)
 create mode 100644 cpp/tests/community/k_truss_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6070621134d..0240e2b892e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -37,30 +37,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
     message(FATAL_ERROR "GCC compiler must be at least 9.3")
 endif()
 
-# Remove the following archs from CMAKE_CUDA_ARCHITECTURES that
-# cuhornet currently doesn't support
-#
-# >= 90
-set(supported_archs "70" "72" "75" "80" "86" "89" "90")
-foreach( arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REPLACE "-real" "" arch ${arch})
-    if( arch IN_LIST supported_archs )
-        list(APPEND usable_arch_values ${arch})
-    endif()
-endforeach()
-# Make sure everything but the 'newest' arch
-# is marked as `-real` so we only generate PTX for
-# arch > 86
-list(POP_BACK usable_arch_values latest_arch)
-list(TRANSFORM usable_arch_values APPEND "-real")
-if (usable_arch_values)
-    list(APPEND usable_arch_values ${latest_arch})
-else()
-    list(APPEND usable_arch_values ${latest_arch}-real)
-endif()
-
-set(CMAKE_CUDA_ARCHITECTURES ${usable_arch_values})
-
 # Write the version header
 rapids_cmake_write_version_file(include/cugraph/version_config.hpp)
 rapids_cmake_write_version_file(include/cugraph_c/version_config.hpp)
@@ -168,7 +144,6 @@ if(USE_CUGRAPH_OPS)
 endif()
 
 include(cmake/thirdparty/get_nccl.cmake)
-include(cmake/thirdparty/get_cuhornet.cmake)
 
 if (BUILD_CUGRAPH_MTMG_TESTS)
   include(cmake/thirdparty/get_ucp.cmake)
@@ -197,6 +172,7 @@ set(CUGRAPH_SOURCES
     src/community/detail/common_methods_sg.cu
     src/community/detail/refine_sg.cu
     src/community/detail/refine_mg.cu
+    src/community/edge_triangle_count_sg.cu
     src/community/detail/maximal_independent_moves_sg.cu
     src/community/detail/maximal_independent_moves_mg.cu
     src/detail/utility_wrappers.cu
@@ -222,10 +198,10 @@ set(CUGRAPH_SOURCES
     src/community/ecg_sg.cu
     src/community/ecg_mg.cu
     src/community/legacy/louvain.cu
-    src/community/legacy/ktruss.cu
     src/community/legacy/ecg.cu
     src/community/egonet_sg.cu
     src/community/egonet_mg.cu
+    src/community/k_truss_sg.cu
     src/sampling/random_walks.cu
     src/sampling/random_walks_sg.cu
     src/sampling/detail/prepare_next_frontier_sg.cu
@@ -391,7 +367,6 @@ if (USE_CUGRAPH_OPS)
             $<$<BOOL:${CUDA_STATIC_RUNTIME}>:raft::raft>
             $<$<BOOL:${CUDA_STATIC_RUNTIME}>:${COMPILED_RAFT_LIB}>
             cuco::cuco
-            cugraph::cuHornet
             NCCL::NCCL
     )
 else()
@@ -404,7 +379,6 @@ else()
             $<$<BOOL:${CUDA_STATIC_RUNTIME}>:raft::raft>
             $<$<BOOL:${CUDA_STATIC_RUNTIME}>:${COMPILED_RAFT_LIB}>
             cuco::cuco
-            cugraph::cuHornet
             NCCL::NCCL
     )
 endif()
@@ -427,6 +401,7 @@ add_library(cugraph_c
         src/c_api/eigenvector_centrality.cpp
         src/c_api/betweenness_centrality.cpp
         src/c_api/core_number.cpp
+        src/c_api/k_truss.cpp
         src/c_api/core_result.cpp
         src/c_api/extract_ego.cpp
         src/c_api/ecg.cpp
@@ -455,7 +430,6 @@ add_library(cugraph_c
         src/c_api/weakly_connected_components.cpp
         src/c_api/strongly_connected_components.cpp
         src/c_api/allgather.cpp
-        src/c_api/legacy_k_truss.cpp
         )
 add_library(cugraph::cugraph_c ALIAS cugraph_c)
 
diff --git a/cpp/cmake/thirdparty/get_cuhornet.cmake b/cpp/cmake/thirdparty/get_cuhornet.cmake
deleted file mode 100644
index d6dc817d78f..00000000000
--- a/cpp/cmake/thirdparty/get_cuhornet.cmake
+++ /dev/null
@@ -1,45 +0,0 @@
-#=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#=============================================================================
-
-function(find_and_configure_cuhornet)
-
-    # We are not using the cuhornet CMake targets, so no need to call `add_subdirectory()`,
-    # or to use CPM
-    FetchContent_Declare(
-        cuhornet
-        GIT_REPOSITORY    https://github.com/rapidsai/cuhornet.git
-        GIT_TAG           17467c88abe2b76df456614575c02f7e9cbfd02d
-        SOURCE_SUBDIR     hornet
-    )
-    FetchContent_GetProperties(cuhornet)
-
-    if(NOT cuhornet_POPULATED)
-        FetchContent_Populate(cuhornet)
-    endif()
-
-    if(NOT TARGET cugraph::cuHornet)
-        add_library(cugraph::cuHornet IMPORTED INTERFACE GLOBAL)
-        target_include_directories(cugraph::cuHornet INTERFACE
-            "${cuhornet_SOURCE_DIR}/hornet/include"
-            "${cuhornet_SOURCE_DIR}/hornetsnest/include"
-            "${cuhornet_SOURCE_DIR}/xlib/include"
-            "${cuhornet_SOURCE_DIR}/primitives"
-            )
-    endif()
-endfunction()
-
-
-find_and_configure_cuhornet()
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 1471d340cec..c817665b1cb 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -427,41 +427,6 @@ void connected_components(legacy::GraphCSRView<VT, ET, WT> const& graph,
                           cugraph_cc_t connectivity_type,
                           VT* labels);
 
-/**
- * @brief     Compute k truss for a graph  ** temporary
- *
- * K Truss is the maximal subgraph of a graph which contains at least three
- * vertices where every edge is incident to at least k-2 triangles.
- *
- * This version is a temporary solution to clean up python integration through the C API.
- *
- * This version is only supported SG.
- *
- * @throws                  cugraph::logic_error with a custom message when an error
- * occurs.
- *
- * @tparam vertex_t         Type of vertex identifiers. Supported value : int (signed, 32-bit)
- * @tparam weight_t         Type of edge weights. Supported values : float or double.
- *
- * @param[in] handle        Library handle (RAFT).
- * @param[in] src           Source vertices from COO
- * @param[in] dst           Destination vertices from COO
- * @param[in] wgt           Optional edge weights from COO
- * @param[in] k             The order of the truss
- * @return                  Tuple containing extracted src, dst and optional weights for the
- * subgraph
- */
-template <typename vertex_t, typename weight_t>
-std::tuple<rmm::device_uvector<vertex_t>,
-           rmm::device_uvector<vertex_t>,
-           std::optional<rmm::device_uvector<weight_t>>>
-k_truss_subgraph(raft::handle_t const& handle,
-                 raft::device_span<vertex_t> src,
-                 raft::device_span<vertex_t> dst,
-                 std::optional<raft::device_span<weight_t>> wgt,
-                 size_t number_of_vertices,
-                 int k);
-
 /**
  * @brief      Compute Hungarian algorithm on a weighted bipartite graph
  *
@@ -1842,7 +1807,7 @@ void weakly_connected_components(raft::handle_t const& handle,
 enum class k_core_degree_type_t { IN = 0, OUT = 1, INOUT = 2 };
 
 /**
- * @brief   Compute core numbers of individual vertices from K-core decomposition.
+ * @brief   Compute core numbers of individual vertices from K-Core decomposition.
  *
  * The input graph should not have self-loops nor multi-edges. Currently, only undirected graphs are
  * supported.
@@ -1855,11 +1820,11 @@ enum class k_core_degree_type_t { IN = 0, OUT = 1, INOUT = 2 };
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Graph view object.
  * @param core_numbers Pointer to the output core number array.
- * @param degree_type Dictate whether to compute the K-core decomposition based on in-degrees,
+ * @param degree_type Dictate whether to compute the K-Core decomposition based on in-degrees,
  * out-degrees, or in-degrees + out_degrees.
- * @param k_first Find K-cores from K = k_first. Any vertices that do not belong to k_first-core
+ * @param k_first Find K-Cores from K = k_first. Any vertices that do not belong to k_first-core
  * will have core numbers of 0.
- * @param k_last Find K-cores to K = k_last. Any vertices that belong to (k_last)-core will have
+ * @param k_last Find K-Cores to K = k_last. Any vertices that belong to (k_last)-core will have
  * their core numbers set to their degrees on k_last-core.
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  */
@@ -1873,7 +1838,7 @@ void core_number(raft::handle_t const& handle,
                  bool do_expensive_check = false);
 
 /**
- * @brief   Extract K Core of a graph
+ * @brief   Extract K-Core of a graph
  *
  * @throws     cugraph::logic_error when an error occurs.
  *
@@ -1884,7 +1849,7 @@ void core_number(raft::handle_t const& handle,
  * @param  graph_view      Graph view object.
  * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
  * @param  k               Order of the core. This value must not be negative.
- * @param degree_type Optional parameter to dictate whether to compute the K-core decomposition
+ * @param degree_type Optional parameter to dictate whether to compute the K-Core decomposition
  *                    based on in-degrees, out-degrees, or in-degrees + out_degrees.  One of @p
  *                    degree_type and @p core_numbers must be specified.
  * @param  core_numbers    Optional output from core_number algorithm.  If not specified then
@@ -2040,6 +2005,32 @@ void triangle_count(raft::handle_t const& handle,
                     raft::device_span<edge_t> counts,
                     bool do_expensive_check = false);
 
+/*
+ * @brief Compute K-Truss.
+ *
+ * Extract the K-Truss subgraph of a graph
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param k The desired k to be used for extracting the K-Truss subgraph
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return edge list of the K-Truss subgraph
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+        std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+        edge_t k,
+        bool do_expensive_check = false);
+
 /**
  * @brief     Compute Jaccard similarity coefficient
  *
diff --git a/cpp/include/cugraph/utilities/graph_traits.hpp b/cpp/include/cugraph/utilities/graph_traits.hpp
index e2737305aed..bd46c9d4fc1 100644
--- a/cpp/include/cugraph/utilities/graph_traits.hpp
+++ b/cpp/include/cugraph/utilities/graph_traits.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,8 +67,8 @@ struct is_candidate {
 };
 
 // meta-function that constrains
-// all 3 template param candidates where vertex_t and edge_t
-// are restricted to int32_t:
+// vertex_t and edge_t are restricted to int32_t:
+// FIXME: Drop this functor as it was only used by legacy K-Truss
 //
 template <typename vertex_t, typename edge_t, typename weight_t>
 struct is_candidate_legacy {
diff --git a/cpp/src/c_api/legacy_k_truss.cpp b/cpp/src/c_api/k_truss.cpp
similarity index 80%
rename from cpp/src/c_api/legacy_k_truss.cpp
rename to cpp/src/c_api/k_truss.cpp
index a6a1f17f482..18e256b022a 100644
--- a/cpp/src/c_api/legacy_k_truss.cpp
+++ b/cpp/src/c_api/k_truss.cpp
@@ -58,7 +58,7 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor {
             bool multi_gpu>
   void operator()()
   {
-    if constexpr (!cugraph::is_candidate_legacy<vertex_t, edge_t, weight_t>::value) {
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
       unsupported();
     } else if constexpr (multi_gpu) {
       unsupported();
@@ -81,26 +81,14 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor {
       auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
 
       auto graph_view = graph->view();
-      rmm::device_uvector<vertex_t> src(0, handle_.get_stream());
-      rmm::device_uvector<vertex_t> dst(0, handle_.get_stream());
-      std::optional<rmm::device_uvector<weight_t>> wgt{std::nullopt};
 
-      std::tie(src, dst, wgt, std::ignore) = cugraph::decompress_to_edgelist(
-        handle_,
-        graph_view,
-        edge_weights ? std::make_optional(edge_weights->view()) : std::nullopt,
-        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
-        std::optional<raft::device_span<vertex_t const>>(std::nullopt),
-        do_expensive_check_);
-
-      auto [result_src, result_dst, result_wgt] = cugraph::k_truss_subgraph(
-        handle_,
-        raft::device_span<vertex_t>(src.data(), src.size()),
-        raft::device_span<vertex_t>(dst.data(), dst.size()),
-        wgt ? std::make_optional(raft::device_span<weight_t>(wgt->data(), wgt->size()))
-            : std::nullopt,
-        graph_view.number_of_vertices(),
-        k_);
+      auto [result_src, result_dst, result_wgt] =
+        cugraph::k_truss<vertex_t, edge_t, weight_t, multi_gpu>(
+          handle_,
+          graph_view,
+          edge_weights ? std::make_optional(edge_weights->view()) : std::nullopt,
+          k_,
+          do_expensive_check_);
 
       cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(
         handle_,
@@ -127,9 +115,9 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor {
       result_ = new cugraph::c_api::cugraph_induced_subgraph_result_t{
         new cugraph::c_api::cugraph_type_erased_device_array_t(result_src, graph_->vertex_type_),
         new cugraph::c_api::cugraph_type_erased_device_array_t(result_dst, graph_->vertex_type_),
-        wgt ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt,
-                                                                     graph_->weight_type_)
-            : NULL,
+        result_wgt ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt,
+                                                                            graph_->weight_type_)
+                   : NULL,
         NULL,
         NULL,
         new cugraph::c_api::cugraph_type_erased_device_array_t(edge_offsets,
diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh
new file mode 100644
index 00000000000..1370c1a17f2
--- /dev/null
+++ b/cpp/src/community/edge_triangle_count_impl.cuh
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/graph_partition_utils.cuh"
+#include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh"
+
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/error.hpp>
+
+#include <thrust/adjacent_difference.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, typename EdgeIterator>
+struct update_edges_p_r_q_r_num_triangles {
+  size_t num_edges{};  // rename to num_edges
+  const edge_t edge_first_or_second{};
+  raft::device_span<size_t const> intersection_offsets{};
+  raft::device_span<vertex_t const> intersection_indices{};
+  raft::device_span<edge_t> num_triangles{};
+
+  EdgeIterator edge_first{};
+
+  __device__ void operator()(size_t i) const
+  {
+    auto itr = thrust::upper_bound(
+      thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
+    auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
+    if (edge_first_or_second == 0) {
+      auto p_r_pair =
+        thrust::make_tuple(thrust::get<0>(*(edge_first + idx)), intersection_indices[i]);
+
+      // Find its position in 'edges'
+      auto itr_p_r_p_q =
+        thrust::lower_bound(thrust::seq,
+                            edge_first,
+                            edge_first + num_edges,  // pass the number of vertex pairs
+                            p_r_pair);
+
+      assert(*itr_p_r_p_q == p_r_pair);
+      idx = thrust::distance(edge_first, itr_p_r_p_q);
+    } else {
+      auto p_r_pair =
+        thrust::make_tuple(thrust::get<1>(*(edge_first + idx)), intersection_indices[i]);
+
+      // Find its position in 'edges'
+      auto itr_p_r_p_q =
+        thrust::lower_bound(thrust::seq,
+                            edge_first,
+                            edge_first + num_edges,  // pass the number of vertex pairs
+                            p_r_pair);
+      assert(*itr_p_r_p_q == p_r_pair);
+      idx = thrust::distance(edge_first, itr_p_r_p_q);
+    }
+    cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(num_triangles[idx]);
+    auto r = atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed);
+  }
+};
+
+template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<edge_t>> edge_triangle_count_impl(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  raft::device_span<vertex_t> edgelist_srcs,
+  raft::device_span<vertex_t> edgelist_dsts)
+{
+  auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
+
+  thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + edgelist_srcs.size());
+
+  // FIXME: Perform 'nbr_intersection' in chunks to reduce peak memory.
+  auto [intersection_offsets, intersection_indices] =
+    detail::nbr_intersection(handle,
+                             graph_view,
+                             cugraph::edge_dummy_property_t{}.view(),
+                             edge_first,
+                             edge_first + edgelist_srcs.size(),
+                             std::array<bool, 2>{true, true},
+                             false /*FIXME: pass 'do_expensive_check' as argument*/);
+
+  rmm::device_uvector<edge_t> num_triangles(edgelist_srcs.size(), handle.get_stream());
+
+  // Update the number of triangles of each (p, q) edges by looking at their intersection
+  // size
+  thrust::adjacent_difference(handle.get_thrust_policy(),
+                              intersection_offsets.begin() + 1,
+                              intersection_offsets.end(),
+                              num_triangles.begin());
+
+  // Given intersection offsets and indices that are used to update the number of
+  // triangles of (p, q) edges where `r`s are the intersection indices, update
+  // the number of triangles of the pairs (p, r) and (q, r).
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+    update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
+      edgelist_srcs.size(),
+      0,
+      raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+      raft::device_span<vertex_t const>(intersection_indices.data(), intersection_indices.size()),
+      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+      edge_first});
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+    update_edges_p_r_q_r_num_triangles<vertex_t, edge_t, decltype(edge_first)>{
+      edgelist_srcs.size(),
+      1,
+      raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+      raft::device_span<vertex_t const>(intersection_indices.data(), intersection_indices.size()),
+      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+      edge_first});
+
+  return num_triangles;
+}
+
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+rmm::device_uvector<edge_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  raft::device_span<vertex_t> edgelist_srcs,
+  raft::device_span<vertex_t> edgelist_dsts)
+{
+  return detail::edge_triangle_count_impl(handle, graph_view, edgelist_srcs, edgelist_dsts);
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/community/edge_triangle_count_sg.cu b/cpp/src/community/edge_triangle_count_sg.cu
new file mode 100644
index 00000000000..c4b7e71b967
--- /dev/null
+++ b/cpp/src/community/edge_triangle_count_sg.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "community/edge_triangle_count_impl.cuh"
+
+namespace cugraph {
+
+// SG instantiation
+template rmm::device_uvector<int32_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  raft::device_span<int32_t> edgelist_srcs,
+  raft::device_span<int32_t> edgelist_dsts);
+
+template rmm::device_uvector<int64_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  raft::device_span<int32_t> edgelist_srcs,
+  raft::device_span<int32_t> edgelist_dsts);
+
+template rmm::device_uvector<int64_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  raft::device_span<int64_t> edgelist_srcs,
+  raft::device_span<int64_t> edgelist_dsts);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh
new file mode 100644
index 00000000000..3db9fd70de2
--- /dev/null
+++ b/cpp/src/community/k_truss_impl.cuh
@@ -0,0 +1,915 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/edge_bucket.cuh"
+#include "prims/extract_transform_e.cuh"
+#include "prims/fill_edge_property.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/utilities/error.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+namespace cugraph {
+
+// FIXME : This will be deleted once edge_triangle_count becomes public
+template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+rmm::device_uvector<edge_t> edge_triangle_count(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  raft::device_span<vertex_t> edgelist_srcs,
+  raft::device_span<vertex_t> edgelist_dsts);
+
+template <typename vertex_t, typename edge_t, typename EdgeIterator>
+struct unroll_edge {
+  size_t num_valid_edges{};
+  raft::device_span<edge_t> num_triangles{};
+  EdgeIterator edge_to_unroll_first{};
+  EdgeIterator transposed_valid_edge_first{};
+  EdgeIterator transposed_valid_edge_last{};
+  EdgeIterator transposed_invalid_edge_last{};
+
+  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
+  {
+    // edges are sorted with destination as key so reverse the edge when looking it
+    auto pair = thrust::make_tuple(thrust::get<1>(*(edge_to_unroll_first + i)),
+                                   thrust::get<0>(*(edge_to_unroll_first + i)));
+    // Find its position in either partition of the transposed edgelist
+    // An edge can be in found in either of the two partitions (valid or invalid)
+    auto itr = thrust::lower_bound(
+      thrust::seq, transposed_valid_edge_last, transposed_invalid_edge_last, pair);
+    size_t idx{};
+    if (itr != transposed_invalid_edge_last && *itr == pair) {
+      idx =
+        static_cast<size_t>(thrust::distance(transposed_valid_edge_last, itr) + num_valid_edges);
+    } else {
+      // The edge must be in the first boundary
+      itr = thrust::lower_bound(
+        thrust::seq, transposed_valid_edge_first, transposed_valid_edge_last, pair);
+      assert(*itr == pair);
+      idx = thrust::distance(transposed_valid_edge_first, itr);
+    }
+    cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(num_triangles[idx]);
+    auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed);
+  }
+};
+
+// FIXME: May re-locate this function as a general utility function for graph algorithm
+// implementations.
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> compute_prefix_sum(raft::handle_t const& handle,
+                                                 raft::device_span<vertex_t const> sorted_vertices,
+                                                 raft::device_span<vertex_t const> query_vertices)
+{
+  rmm::device_uvector<vertex_t> prefix_sum(query_vertices.size() + 1, handle.get_stream());
+
+  auto count_first = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(size_t{0}),
+    cuda::proclaim_return_type<vertex_t>(
+      [query_vertices,
+       num_edges       = sorted_vertices.size(),
+       sorted_vertices = sorted_vertices.begin()] __device__(size_t idx) {
+        auto itr_lower = thrust::lower_bound(
+          thrust::seq, sorted_vertices, sorted_vertices + num_edges, query_vertices[idx]);
+
+        auto itr_upper = thrust::upper_bound(
+          thrust::seq, itr_lower, sorted_vertices + num_edges, query_vertices[idx]);
+        vertex_t dist = thrust::distance(itr_lower, itr_upper);
+
+        return dist;
+      }));
+
+  thrust::exclusive_scan(handle.get_thrust_policy(),
+                         count_first,
+                         count_first + query_vertices.size() + 1,
+                         prefix_sum.begin());
+
+  return prefix_sum;
+}
+
+template <typename vertex_t, typename edge_t, typename EdgeIterator>
+edge_t remove_overcompensating_edges(raft::handle_t const& handle,
+                                     size_t buffer_size,
+                                     EdgeIterator potential_closing_or_incoming_edges,
+                                     EdgeIterator incoming_or_potential_closing_edges,
+                                     raft::device_span<vertex_t const> invalid_edgelist_srcs,
+                                     raft::device_span<vertex_t const> invalid_edgelist_dsts)
+{
+  // To avoid over-compensating, check whether the 'potential_closing_edges'
+  // are within the invalid edges. If yes, the was already unrolled
+  auto edges_not_overcomp = thrust::remove_if(
+    handle.get_thrust_policy(),
+    thrust::make_zip_iterator(potential_closing_or_incoming_edges,
+                              incoming_or_potential_closing_edges),
+    thrust::make_zip_iterator(potential_closing_or_incoming_edges + buffer_size,
+                              incoming_or_potential_closing_edges + buffer_size),
+    [num_invalid_edges = invalid_edgelist_dsts.size(),
+     invalid_first =
+       thrust::make_zip_iterator(invalid_edgelist_dsts.begin(), invalid_edgelist_srcs.begin()),
+     invalid_last = thrust::make_zip_iterator(invalid_edgelist_dsts.end(),
+                                              invalid_edgelist_srcs.end())] __device__(auto e) {
+      auto potential_edge = thrust::get<0>(e);
+      auto transposed_potential_or_incoming_edge =
+        thrust::make_tuple(thrust::get<1>(potential_edge), thrust::get<0>(potential_edge));
+      auto itr = thrust::lower_bound(
+        thrust::seq, invalid_first, invalid_last, transposed_potential_or_incoming_edge);
+      return (itr != invalid_last && *itr == transposed_potential_or_incoming_edge);
+    });
+
+  auto dist = thrust::distance(thrust::make_zip_iterator(potential_closing_or_incoming_edges,
+                                                         incoming_or_potential_closing_edges),
+                               edges_not_overcomp);
+
+  return dist;
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu, bool is_q_r_edge>
+void unroll_p_r_or_q_r_edges(raft::handle_t const& handle,
+                             graph_view_t<vertex_t, edge_t, false, false>& graph_view,
+                             size_t num_invalid_edges,
+                             size_t num_valid_edges,
+                             raft::device_span<vertex_t const> edgelist_srcs,
+                             raft::device_span<vertex_t const> edgelist_dsts,
+                             raft::device_span<edge_t> num_triangles)
+{
+  auto prefix_sum_valid = compute_prefix_sum(
+    handle,
+    raft::device_span<vertex_t const>(edgelist_dsts.data(), num_valid_edges),
+    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
+
+  auto prefix_sum_invalid = compute_prefix_sum(
+    handle,
+    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges),
+    raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
+
+  auto potential_closing_edges = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
+    prefix_sum_valid.back_element(handle.get_stream()) +
+      prefix_sum_invalid.back_element(handle.get_stream()),
+    handle.get_stream());
+
+  auto incoming_edges_to_r = allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(
+    prefix_sum_valid.back_element(handle.get_stream()) +
+      prefix_sum_invalid.back_element(handle.get_stream()),
+    handle.get_stream());
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(num_invalid_edges),
+    [num_valid_edges,
+     num_invalid_edges,
+     invalid_dst_first       = edgelist_dsts.begin() + num_valid_edges,
+     invalid_src_first       = edgelist_srcs.begin() + num_valid_edges,
+     valid_src_first         = edgelist_srcs.begin(),
+     valid_dst_first         = edgelist_dsts.begin(),
+     prefix_sum_valid        = prefix_sum_valid.data(),
+     prefix_sum_invalid      = prefix_sum_invalid.data(),
+     potential_closing_edges = get_dataframe_buffer_begin(potential_closing_edges),
+     incoming_edges_to_r = get_dataframe_buffer_begin(incoming_edges_to_r)] __device__(auto idx) {
+      auto src                 = invalid_src_first[idx];
+      auto dst                 = invalid_dst_first[idx];
+      auto dst_array_end_valid = valid_dst_first + num_valid_edges;
+
+      auto itr_lower_valid =
+        thrust::lower_bound(thrust::seq, valid_dst_first, dst_array_end_valid, dst);
+      auto idx_lower_valid = thrust::distance(
+        valid_dst_first,
+        itr_lower_valid);  // Need a binary search to find the begining of the range
+
+      auto invalid_end_dst = invalid_dst_first + num_invalid_edges;
+
+      auto itr_lower_invalid =
+        thrust::lower_bound(thrust::seq, invalid_dst_first, invalid_end_dst, dst);
+      auto idx_lower_invalid = thrust::distance(
+        invalid_dst_first,
+        itr_lower_invalid);  // Need a binary search to find the begining of the range
+
+      auto incoming_edges_to_r_first_valid = thrust::make_zip_iterator(
+        valid_src_first + idx_lower_valid, thrust::make_constant_iterator(dst));
+      thrust::copy(
+        thrust::seq,
+        incoming_edges_to_r_first_valid,
+        incoming_edges_to_r_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
+        incoming_edges_to_r + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
+
+      auto incoming_edges_to_r_first_invalid = thrust::make_zip_iterator(
+        invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(dst));
+      thrust::copy(
+        thrust::seq,
+        incoming_edges_to_r_first_invalid,
+        incoming_edges_to_r_first_invalid + (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
+
+        incoming_edges_to_r + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]);
+
+      if constexpr (is_q_r_edge) {
+        auto potential_closing_edges_first_valid = thrust::make_zip_iterator(
+          valid_src_first + idx_lower_valid, thrust::make_constant_iterator(src));
+        thrust::copy(
+          thrust::seq,
+          potential_closing_edges_first_valid,
+          potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
+          potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
+
+        auto potential_closing_edges_first_invalid = thrust::make_zip_iterator(
+          invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(src));
+        thrust::copy(thrust::seq,
+                     potential_closing_edges_first_invalid,
+                     potential_closing_edges_first_invalid +
+                       (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
+                     potential_closing_edges + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]);
+
+      } else {
+        auto potential_closing_edges_first_valid = thrust::make_zip_iterator(
+          thrust::make_constant_iterator(src), valid_src_first + idx_lower_valid);
+        thrust::copy(
+          thrust::seq,
+          potential_closing_edges_first_valid,
+          potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]),
+          potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]);
+
+        auto potential_closing_edges_first_invalid = thrust::make_zip_iterator(
+          thrust::make_constant_iterator(src), invalid_src_first + idx_lower_invalid);
+        thrust::copy(
+          thrust::seq,
+          potential_closing_edges_first_invalid,
+          potential_closing_edges_first_invalid +
+            (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]),
+          potential_closing_edges + prefix_sum_invalid[idx] + (prefix_sum_valid[idx + 1]));
+      }
+    });
+
+  auto edges_exist = graph_view.has_edge(
+    handle,
+    raft::device_span<vertex_t const>(std::get<0>(potential_closing_edges).data(),
+                                      std::get<0>(potential_closing_edges).size()),
+    raft::device_span<vertex_t const>(std::get<1>(potential_closing_edges).data(),
+                                      std::get<1>(potential_closing_edges).size()));
+
+  auto edge_to_existance = thrust::make_zip_iterator(
+    thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges),
+                              get_dataframe_buffer_begin(incoming_edges_to_r)),
+    edges_exist.begin());
+
+  auto has_edge_last = thrust::remove_if(handle.get_thrust_policy(),
+                                         edge_to_existance,
+                                         edge_to_existance + edges_exist.size(),
+                                         [] __device__(auto e) {
+                                           auto edge_exists = thrust::get<1>(e);
+                                           return edge_exists == 0;
+                                         });
+
+  auto num_edge_exists = thrust::distance(edge_to_existance, has_edge_last);
+
+  // After pushing the non-existant edges to the second partition,
+  // remove them by resizing  both vertex pair buffer
+  resize_dataframe_buffer(potential_closing_edges, num_edge_exists, handle.get_stream());
+  resize_dataframe_buffer(incoming_edges_to_r, num_edge_exists, handle.get_stream());
+
+  auto num_edges_not_overcomp =
+    remove_overcompensating_edges<vertex_t,
+                                  edge_t,
+                                  decltype(get_dataframe_buffer_begin(potential_closing_edges))>(
+      handle,
+      num_edge_exists,
+      get_dataframe_buffer_begin(potential_closing_edges),
+      get_dataframe_buffer_begin(incoming_edges_to_r),
+      raft::device_span<vertex_t const>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
+      raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges));
+
+  // After pushing the non-existant edges to the second partition,
+  // remove them by resizing  both vertex pair buffer
+  resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream());
+  resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream());
+
+  // Extra check for 'incoming_edges_to_r'
+  if constexpr (!is_q_r_edge) {
+    // Exchange the arguments (incoming_edges_to_r, num_edges_not_overcomp) order
+    // To also check if the 'incoming_edges_to_r' belong the the invalid_edgelist
+    num_edges_not_overcomp =
+      remove_overcompensating_edges<vertex_t,
+                                    edge_t,
+                                    decltype(get_dataframe_buffer_begin(potential_closing_edges))>(
+        handle,
+        num_edges_not_overcomp,
+        get_dataframe_buffer_begin(incoming_edges_to_r),
+        get_dataframe_buffer_begin(potential_closing_edges),
+        raft::device_span<vertex_t const>(edgelist_srcs.data() + num_valid_edges,
+                                          num_invalid_edges),
+        raft::device_span<vertex_t const>(edgelist_dsts.data() + num_valid_edges,
+                                          num_invalid_edges));
+
+    resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream());
+    resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream());
+  }
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges),
+                              get_dataframe_buffer_begin(incoming_edges_to_r)),
+    thrust::make_zip_iterator(
+      get_dataframe_buffer_begin(potential_closing_edges) + num_edges_not_overcomp,
+      get_dataframe_buffer_begin(incoming_edges_to_r) + num_edges_not_overcomp),
+    [num_triangles = num_triangles.begin(),
+     num_valid_edges,
+     invalid_first = thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
+                                               edgelist_srcs.begin() + num_valid_edges),
+     invalid_last  = thrust::make_zip_iterator(
+       edgelist_dsts.end(), edgelist_srcs.end())] __device__(auto potential_or_incoming_e) {
+      auto potential_e     = thrust::get<0>(potential_or_incoming_e);
+      auto incoming_e_to_r = thrust::get<1>(potential_or_incoming_e);
+      // thrust::tuple<vertex_t, vertex_t>> transposed_invalid_edge_;
+      auto transposed_invalid_edge =
+        thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<1>(potential_e));
+
+      if constexpr (!is_q_r_edge) {
+        transposed_invalid_edge =
+          thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<0>(potential_e));
+      }
+      auto itr =
+        thrust::lower_bound(thrust::seq, invalid_first, invalid_last, transposed_invalid_edge);
+      if (itr != invalid_last) { assert(*itr == transposed_invalid_edge); }
+      auto dist = thrust::distance(invalid_first, itr) + num_valid_edges;
+
+      cuda::atomic_ref<edge_t, cuda::thread_scope_device> atomic_counter(num_triangles[dist]);
+      auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed);
+    });
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(num_edges_not_overcomp),
+    unroll_edge<vertex_t,
+                edge_t,
+                decltype(thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()))>{
+      num_valid_edges,
+      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+      get_dataframe_buffer_begin(potential_closing_edges),
+      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()),
+      thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
+                                edgelist_srcs.begin() + num_valid_edges),
+      thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())});
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<edge_t>(0),
+    thrust::make_counting_iterator<edge_t>(num_edges_not_overcomp),
+    unroll_edge<vertex_t,
+                edge_t,
+                decltype(thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()))>{
+      num_valid_edges,
+      raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+      get_dataframe_buffer_begin(incoming_edges_to_r),
+      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()),
+      thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges,
+                                edgelist_srcs.begin() + num_valid_edges),
+      thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())});
+}
+
+namespace {
+
+template <typename vertex_t>
+struct exclude_self_loop_t {
+  __device__ thrust::optional<thrust::tuple<vertex_t, vertex_t>> operator()(
+    vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const
+  {
+    return src != dst
+             ? thrust::optional<thrust::tuple<vertex_t, vertex_t>>{thrust::make_tuple(src, dst)}
+             : thrust::nullopt;
+  }
+};
+
+template <typename vertex_t, typename weight_t, typename edge_t>
+struct extract_low_to_high_degree_weighted_edges_t {
+  __device__ thrust::optional<thrust::tuple<vertex_t, vertex_t, weight_t>> operator()(
+    vertex_t src, vertex_t dst, edge_t src_out_degree, edge_t dst_out_degree, weight_t wgt) const
+  {
+    return (src_out_degree < dst_out_degree)
+             ? thrust::optional<thrust::tuple<vertex_t, vertex_t, weight_t>>{thrust::make_tuple(
+                 src, dst, wgt)}
+             : (((src_out_degree == dst_out_degree) &&
+                 (src < dst) /* tie-breaking using vertex ID */)
+                  ? thrust::optional<
+                      thrust::tuple<vertex_t, vertex_t, weight_t>>{thrust::make_tuple(
+                      src, dst, wgt)}
+                  : thrust::nullopt);
+  }
+};
+
+template <typename vertex_t, typename edge_t>
+struct extract_low_to_high_degree_edges_t {
+  __device__ thrust::optional<thrust::tuple<vertex_t, vertex_t>> operator()(vertex_t src,
+                                                                            vertex_t dst,
+                                                                            edge_t src_out_degree,
+                                                                            edge_t dst_out_degree,
+                                                                            thrust::nullopt_t) const
+  {
+    return (src_out_degree < dst_out_degree)
+             ? thrust::optional<thrust::tuple<vertex_t, vertex_t>>{thrust::make_tuple(src, dst)}
+             : (((src_out_degree == dst_out_degree) &&
+                 (src < dst) /* tie-breaking using vertex ID */)
+                  ? thrust::optional<thrust::tuple<vertex_t, vertex_t>>{thrust::make_tuple(src,
+                                                                                           dst)}
+                  : thrust::nullopt);
+  }
+};
+
+template <typename vertex_t, typename edge_t, bool generate_p_r>
+struct generate_p_r_or_q_r_from_p_q {
+  raft::device_span<size_t const> intersection_offsets{};
+  raft::device_span<vertex_t const> intersection_indices{};
+  raft::device_span<vertex_t const> invalid_srcs{};
+  raft::device_span<vertex_t const> invalid_dsts{};
+
+  __device__ thrust::tuple<vertex_t, vertex_t> operator()(edge_t i) const
+  {
+    auto itr = thrust::upper_bound(
+      thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i);
+    auto idx = thrust::distance(intersection_offsets.begin() + 1, itr);
+
+    if constexpr (generate_p_r) {
+      return thrust::make_tuple(invalid_srcs[idx], intersection_indices[i]);
+
+    } else {
+      return thrust::make_tuple(invalid_dsts[idx], intersection_indices[i]);
+    }
+  }
+};
+}  // namespace
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+        std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+        edge_t k,
+        bool do_expensive_check)
+{
+  // 1. Check input arguments.
+
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  CUGRAPH_EXPECTS(graph_view.is_symmetric(),
+                  "Invalid input arguments: K-truss currently supports undirected graphs only.");
+  CUGRAPH_EXPECTS(!graph_view.is_multigraph(),
+                  "Invalid input arguments: K-truss currently does not support multi-graphs.");
+
+  if (do_expensive_check) {
+    // nothing to do
+  }
+
+  std::optional<graph_t<vertex_t, edge_t, false, multi_gpu>> modified_graph{std::nullopt};
+  std::optional<graph_view_t<vertex_t, edge_t, false, multi_gpu>> modified_graph_view{std::nullopt};
+  std::optional<rmm::device_uvector<vertex_t>> renumber_map{std::nullopt};
+  std::optional<edge_property_t<graph_view_t<vertex_t, edge_t, false, multi_gpu>, weight_t>>
+    edge_weight{std::nullopt};
+
+  if (graph_view.count_self_loops(handle) > edge_t{0}) {
+    auto [srcs, dsts] = extract_transform_e(handle,
+                                            graph_view,
+                                            edge_src_dummy_property_t{}.view(),
+                                            edge_dst_dummy_property_t{}.view(),
+                                            edge_dummy_property_t{}.view(),
+                                            exclude_self_loop_t<vertex_t>{});
+
+    if constexpr (multi_gpu) {
+      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
+        detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
+                                                                                       edge_t,
+                                                                                       weight_t,
+                                                                                       int32_t>(
+          handle, std::move(srcs), std::move(dsts), std::nullopt, std::nullopt, std::nullopt);
+    }
+
+    std::tie(*modified_graph, std::ignore, std::ignore, std::ignore, renumber_map) =
+      create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
+        handle,
+        std::nullopt,
+        std::move(srcs),
+        std::move(dsts),
+        std::nullopt,
+        std::nullopt,
+        std::nullopt,
+        cugraph::graph_properties_t{true, graph_view.is_multigraph()},
+        true);
+
+    modified_graph_view = (*modified_graph).view();
+  }
+
+  // FIXME: Investigate k-1 core failure to yield correct results.
+  // 3. Find (k-1)-core and exclude edges that do not belong to (k-1)-core
+  /*
+  {
+    auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
+    auto vertex_partition_range_lasts =
+      renumber_map
+        ? std::make_optional<std::vector<vertex_t>>(cur_graph_view.vertex_partition_range_lasts())
+        : std::nullopt;
+
+    rmm::device_uvector<edge_t> d_core_numbers(cur_graph_view.local_vertex_partition_range_size(),
+                                               handle.get_stream());
+    raft::device_span<edge_t const> core_number_span{d_core_numbers.data(), d_core_numbers.size()};
+
+    rmm::device_uvector<vertex_t> srcs{0, handle.get_stream()};
+    rmm::device_uvector<vertex_t> dsts{0, handle.get_stream()};
+    std::tie(srcs, dsts, std::ignore) =
+      k_core(handle,
+             cur_graph_view,
+             std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+             size_t{k - 1},
+             std::make_optional(k_core_degree_type_t::OUT),
+             // Seems like the below argument is required. passing a std::nullopt
+             // create a compiler error
+             std::make_optional(core_number_span));
+
+    if constexpr (multi_gpu) {
+      std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) =
+        detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
+                                                                                       edge_t,
+                                                                                       weight_t,
+                                                                                       int32_t>(
+          handle, std::move(srcs), std::move(dsts), std::nullopt, std::nullopt, std::nullopt);
+    }
+
+    std::optional<rmm::device_uvector<vertex_t>> tmp_renumber_map{std::nullopt};
+
+    std::tie(*modified_graph, std::ignore, std::ignore, std::ignore, tmp_renumber_map) =
+      create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
+        handle,
+        std::nullopt,
+        std::move(srcs),
+        std::move(dsts),
+        std::nullopt,
+        std::nullopt,
+        std::nullopt,
+        cugraph::graph_properties_t{true, graph_view.is_multigraph()},
+        true);
+
+    modified_graph_view = (*modified_graph).view();
+
+    if (renumber_map) {  // collapse renumber_map
+      unrenumber_int_vertices<vertex_t, multi_gpu>(handle,
+                                                   (*tmp_renumber_map).data(),
+                                                   (*tmp_renumber_map).size(),
+                                                   (*renumber_map).data(),
+                                                   *vertex_partition_range_lasts);
+    }
+    renumber_map = std::move(tmp_renumber_map);
+  }
+  */
+
+  // 4. Keep only the edges from a low-degree vertex to a high-degree vertex.
+
+  {
+    auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
+
+    auto vertex_partition_range_lasts =
+      renumber_map
+        ? std::make_optional<std::vector<vertex_t>>(cur_graph_view.vertex_partition_range_lasts())
+        : std::nullopt;
+
+    auto out_degrees = cur_graph_view.compute_out_degrees(handle);
+    edge_src_property_t<decltype(cur_graph_view), edge_t> edge_src_out_degrees(handle,
+                                                                               cur_graph_view);
+    edge_dst_property_t<decltype(cur_graph_view), edge_t> edge_dst_out_degrees(handle,
+                                                                               cur_graph_view);
+    update_edge_src_property(handle, cur_graph_view, out_degrees.begin(), edge_src_out_degrees);
+    update_edge_dst_property(handle, cur_graph_view, out_degrees.begin(), edge_dst_out_degrees);
+
+    rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
+    std::optional<rmm::device_uvector<weight_t>> wgts{std::nullopt};
+    if (edge_weight_view) {
+      std::tie(srcs, dsts, wgts) = extract_transform_e(
+        handle,
+        cur_graph_view,
+        edge_src_out_degrees.view(),
+        edge_dst_out_degrees.view(),
+        *edge_weight_view,
+        extract_low_to_high_degree_weighted_edges_t<vertex_t, weight_t, edge_t>{});
+    } else {
+      std::tie(srcs, dsts) =
+        extract_transform_e(handle,
+                            cur_graph_view,
+                            edge_src_out_degrees.view(),
+                            edge_dst_out_degrees.view(),
+                            edge_dummy_property_t{}.view(),
+                            extract_low_to_high_degree_edges_t<vertex_t, edge_t>{});
+    }
+
+    if constexpr (multi_gpu) {
+      std::tie(srcs, dsts, wgts, std::ignore, std::ignore) =
+        detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
+                                                                                       edge_t,
+                                                                                       weight_t,
+                                                                                       int32_t>(
+          handle, std::move(srcs), std::move(dsts), std::move(wgts), std::nullopt, std::nullopt);
+    }
+
+    std::optional<rmm::device_uvector<vertex_t>> tmp_renumber_map{std::nullopt};
+
+    std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) =
+      create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
+        handle,
+        std::nullopt,
+        std::move(srcs),
+        std::move(dsts),
+        std::move(wgts),
+        std::nullopt,
+        std::nullopt,
+        cugraph::graph_properties_t{false /* now asymmetric */, cur_graph_view.is_multigraph()},
+        false);
+
+    modified_graph_view = (*modified_graph).view();
+    if (renumber_map) {  // collapse renumber_map
+      unrenumber_int_vertices<vertex_t, multi_gpu>(handle,
+                                                   (*tmp_renumber_map).data(),
+                                                   (*tmp_renumber_map).size(),
+                                                   (*renumber_map).data(),
+                                                   *vertex_partition_range_lasts);
+    }
+    renumber_map = std::move(tmp_renumber_map);
+  }
+
+  // 5. Decompress the resulting graph to an edges list and ind intersection of edges endpoints
+  // for each partition using detail::nbr_intersection
+
+  {
+    auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view;
+    rmm::device_uvector<vertex_t> edgelist_srcs(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> edgelist_dsts(0, handle.get_stream());
+    std::optional<rmm::device_uvector<weight_t>> edgelist_wgts{std::nullopt};
+
+    edge_weight_view =
+      edge_weight ? std::make_optional((*edge_weight).view())
+                  : std::optional<edge_property_view_t<edge_t, weight_t const*>>{std::nullopt};
+    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore) = decompress_to_edgelist(
+      handle,
+      cur_graph_view,
+      edge_weight_view,
+      std::optional<edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+      std::optional<raft::device_span<vertex_t const>>(std::nullopt));
+
+    auto num_triangles = edge_triangle_count<vertex_t, edge_t, false, false>(
+      handle,
+      cur_graph_view,
+      raft::device_span<vertex_t>(edgelist_srcs.data(), edgelist_srcs.size()),
+      raft::device_span<vertex_t>(edgelist_dsts.data(), edgelist_dsts.size()));
+
+    auto transposed_edge_first =
+      thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin());
+
+    auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin());
+
+    auto transposed_edge_triangle_count_pair_first =
+      thrust::make_zip_iterator(transposed_edge_first, num_triangles.begin());
+
+    thrust::sort_by_key(handle.get_thrust_policy(),
+                        transposed_edge_first,
+                        transposed_edge_first + edgelist_srcs.size(),
+                        num_triangles.begin());
+
+    cugraph::edge_property_t<decltype(cur_graph_view), bool> edge_mask(handle, cur_graph_view);
+    cugraph::fill_edge_property(handle, cur_graph_view, true, edge_mask);
+    cur_graph_view.attach_edge_mask(edge_mask.view());
+
+    while (true) {
+      // 'invalid_transposed_edge_triangle_count_first' marks the beginning of the edges to be
+      // removed 'invalid_transposed_edge_triangle_count_first' + edgelist_srcs.size() marks the end
+      // of the edges to be removed 'edge_triangle_count_pair_first' marks the begining of the valid
+      // edges.
+      auto invalid_transposed_edge_triangle_count_first =
+        thrust::stable_partition(handle.get_thrust_policy(),
+                                 transposed_edge_triangle_count_pair_first,
+                                 transposed_edge_triangle_count_pair_first + edgelist_srcs.size(),
+                                 [k] __device__(auto e) {
+                                   auto num_triangles = thrust::get<1>(e);
+                                   return num_triangles >= k - 2;
+                                 });
+      auto num_invalid_edges = static_cast<size_t>(
+        thrust::distance(invalid_transposed_edge_triangle_count_first,
+                         transposed_edge_triangle_count_pair_first + edgelist_srcs.size()));
+
+      if (num_invalid_edges == 0) { break; }
+
+      auto num_valid_edges = edgelist_srcs.size() - num_invalid_edges;
+
+      // case 1. For the (p, q), find intersection 'r'.
+
+      // nbr_intersection requires the edges to be sort by 'src'
+      // sort the invalid edges by src for nbr intersection
+      thrust::sort_by_key(handle.get_thrust_policy(),
+                          edge_first + num_valid_edges,
+                          edge_first + edgelist_srcs.size(),
+                          num_triangles.begin() + num_valid_edges);
+
+      auto [intersection_offsets, intersection_indices] =
+        detail::nbr_intersection(handle,
+                                 cur_graph_view,
+                                 cugraph::edge_dummy_property_t{}.view(),
+                                 edge_first + num_valid_edges,
+                                 edge_first + edgelist_srcs.size(),
+                                 std::array<bool, 2>{true, true},
+                                 do_expensive_check);
+
+      // Update the number of triangles of each (p, q) edges by looking at their intersection
+      // size.
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        thrust::make_counting_iterator<edge_t>(0),
+        thrust::make_counting_iterator<edge_t>(num_invalid_edges),
+        [num_triangles =
+           raft::device_span<edge_t>(num_triangles.data() + num_valid_edges, num_invalid_edges),
+         intersection_offsets = raft::device_span<size_t const>(
+           intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) {
+          num_triangles[i] -= intersection_offsets[i + 1] - intersection_offsets[i];
+        });
+
+      // FIXME: Find a way to not have to maintain a dataframe_buffer
+      auto vertex_pair_buffer_p_r_edge_p_q =
+        allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
+                                                                     handle.get_stream());
+
+      thrust::tabulate(
+        handle.get_thrust_policy(),
+        get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
+        get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q),
+        generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, true>{
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
+          raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)});
+
+      auto vertex_pair_buffer_q_r_edge_p_q =
+        allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(intersection_indices.size(),
+                                                                     handle.get_stream());
+      thrust::tabulate(
+        handle.get_thrust_policy(),
+        get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
+        get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q),
+        generate_p_r_or_q_r_from_p_q<vertex_t, edge_t, false>{
+          raft::device_span<size_t const>(intersection_offsets.data(), intersection_offsets.size()),
+          raft::device_span<vertex_t const>(intersection_indices.data(),
+                                            intersection_indices.size()),
+          raft::device_span<vertex_t>(edgelist_srcs.data() + num_valid_edges, num_invalid_edges),
+          raft::device_span<vertex_t>(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)});
+
+      // Unrolling the edges require the edges to be sorted by destination
+      // re-sort the invalid edges by 'dst'
+      thrust::sort_by_key(handle.get_thrust_policy(),
+                          transposed_edge_first + num_valid_edges,
+                          transposed_edge_first + edgelist_srcs.size(),
+                          num_triangles.begin() + num_valid_edges);
+
+      thrust::for_each(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator<edge_t>(0),
+                       thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+                       unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
+                         num_valid_edges,
+                         raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+                         get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q),
+                         transposed_edge_first,
+                         transposed_edge_first + num_valid_edges,
+                         transposed_edge_first + edgelist_srcs.size()});
+
+      thrust::for_each(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator<edge_t>(0),
+                       thrust::make_counting_iterator<edge_t>(intersection_indices.size()),
+                       unroll_edge<vertex_t, edge_t, decltype(transposed_edge_first)>{
+                         num_valid_edges,
+                         raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()),
+                         get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q),
+                         transposed_edge_first,
+                         transposed_edge_first + num_valid_edges,
+                         transposed_edge_first + edgelist_srcs.size()});
+
+      // case 2: unroll (q, r)
+      // For each (q, r) edges to unroll, find the incoming edges to 'r' let's say from 'p' and
+      // create the pair (p, q)
+      cugraph::unroll_p_r_or_q_r_edges<vertex_t, edge_t, false, true>(
+        handle,
+        cur_graph_view,
+        num_invalid_edges,
+        num_valid_edges,
+        raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+        raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
+        raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()));
+
+      // case 3: unroll (p, r)
+      cugraph::unroll_p_r_or_q_r_edges<vertex_t, edge_t, false, false>(
+        handle,
+        cur_graph_view,
+        num_invalid_edges,
+        num_valid_edges,
+        raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+        raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()),
+        raft::device_span<edge_t>(num_triangles.data(), num_triangles.size()));
+
+      // Remove edges that have a triangle count of zero. Those should not be accounted
+      // for during the unroling phase.
+      auto edges_with_triangle_last =
+        thrust::stable_partition(handle.get_thrust_policy(),
+                                 transposed_edge_triangle_count_pair_first,
+                                 transposed_edge_triangle_count_pair_first + num_triangles.size(),
+                                 [] __device__(auto e) {
+                                   auto num_triangles = thrust::get<1>(e);
+                                   return num_triangles > 0;
+                                 });
+
+      auto num_edges_with_triangles = static_cast<size_t>(
+        thrust::distance(transposed_edge_triangle_count_pair_first, edges_with_triangle_last));
+
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(edgelist_srcs.begin() + num_edges_with_triangles,
+                                             edgelist_dsts.begin() + num_edges_with_triangles),
+                   thrust::make_zip_iterator(edgelist_srcs.end(), edgelist_dsts.end()));
+
+      cugraph::edge_bucket_t<vertex_t, void, true, multi_gpu, true> edges_with_no_triangle(handle);
+      edges_with_no_triangle.insert(edgelist_srcs.begin() + num_edges_with_triangles,
+                                    edgelist_srcs.end(),
+                                    edgelist_dsts.begin() + num_edges_with_triangles);
+
+      cur_graph_view.clear_edge_mask();
+      if (edge_weight_view) {
+        cugraph::transform_e(
+          handle,
+          cur_graph_view,
+          edges_with_no_triangle,
+          cugraph::edge_src_dummy_property_t{}.view(),
+          cugraph::edge_dst_dummy_property_t{}.view(),
+          *edge_weight_view,
+          [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wgt) {
+            return false;
+          },
+          edge_mask.mutable_view(),
+          false);
+      } else {
+        cugraph::transform_e(
+          handle,
+          cur_graph_view,
+          edges_with_no_triangle,
+          cugraph::edge_src_dummy_property_t{}.view(),
+          cugraph::edge_dst_dummy_property_t{}.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          [] __device__(
+            auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+            return false;
+          },
+          edge_mask.mutable_view(),
+          false);
+      }
+      cur_graph_view.attach_edge_mask(edge_mask.view());
+
+      edgelist_srcs.resize(num_edges_with_triangles, handle.get_stream());
+      edgelist_dsts.resize(num_edges_with_triangles, handle.get_stream());
+      num_triangles.resize(num_edges_with_triangles, handle.get_stream());
+    }
+
+    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore) = decompress_to_edgelist(
+      handle,
+      cur_graph_view,
+      edge_weight_view ? std::make_optional(*edge_weight_view) : std::nullopt,
+      std::optional<edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+      std::optional<raft::device_span<vertex_t const>>(std::nullopt));
+
+    std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts) =
+      symmetrize_edgelist<vertex_t, weight_t, false, multi_gpu>(handle,
+                                                                std::move(edgelist_srcs),
+                                                                std::move(edgelist_dsts),
+                                                                std::move(edgelist_wgts),
+                                                                false);
+
+    return std::make_tuple(
+      std::move(edgelist_srcs), std::move(edgelist_dsts), std::move(edgelist_wgts));
+  }
+}
+}  // namespace cugraph
diff --git a/cpp/src/community/k_truss_sg.cu b/cpp/src/community/k_truss_sg.cu
new file mode 100644
index 00000000000..dfea62182f5
--- /dev/null
+++ b/cpp/src/community/k_truss_sg.cu
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "community/k_truss_impl.cuh"
+
+namespace cugraph {
+
+// SG instantiation
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+        int32_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+        int32_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>>
+k_truss(raft::handle_t const& handle,
+        graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+        std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+        int64_t k,
+        bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/community/legacy/ktruss.cu b/cpp/src/community/legacy/ktruss.cu
deleted file mode 100644
index 38b68eb1947..00000000000
--- a/cpp/src/community/legacy/ktruss.cu
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * ---------------------------------------------------------------------------*
- * @brief KTruss implementation
- *
- * @file ktruss.cu
- * --------------------------------------------------------------------------*/
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/utilities/error.hpp>
-
-#include <Hornet.hpp>
-#include <StandardAPI.hpp>
-#include <Static/KTruss/KTruss.cuh>
-
-using namespace hornets_nest;
-
-namespace cugraph {
-
-namespace detail {
-
-template <typename vertex_t>
-std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> ktruss_subgraph_impl(
-  raft::handle_t const& handle,
-  raft::device_span<vertex_t> src,
-  raft::device_span<vertex_t> dst,
-  size_t number_of_vertices,
-  int k)
-{
-  using HornetGraph = hornet::gpu::Hornet<vertex_t>;
-  using UpdatePtr   = hornet::BatchUpdatePtr<vertex_t, hornet::EMPTY, hornet::DeviceType::DEVICE>;
-  using Update      = hornet::gpu::BatchUpdate<vertex_t>;
-
-  HornetGraph hnt(number_of_vertices + 1);
-
-  // NOTE: Should a constant pointer be passed for @src and @dst
-  UpdatePtr ptr(static_cast<int>(src.size()), src.data(), dst.data());
-  Update batch(ptr);
-
-  hnt.insert(batch);
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to initialize graph");
-
-  KTruss kt(hnt);
-
-  kt.init();
-  kt.reset();
-  kt.createOffSetArray();
-  // NOTE : These parameters will become obsolete once we move to the updated
-  // algorithm (https://ieeexplore.ieee.org/document/8547581)
-  kt.setInitParameters(4,      // Number of threads per block per list intersection
-                       8,      // Number of intersections per block
-                       2,      // log2(Number of threads)
-                       64000,  // Total number of blocks launched
-                       32);    // Thread block dimension
-  kt.reset();
-  kt.sortHornet();
-
-  kt.runForK(k);
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run");
-
-  rmm::device_uvector<vertex_t> result_src(kt.getGraphEdgeCount(), handle.get_stream());
-  rmm::device_uvector<vertex_t> result_dst(kt.getGraphEdgeCount(), handle.get_stream());
-
-  kt.copyGraph(result_src.data(), result_dst.data());
-
-  kt.release();
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to release");
-
-  return std::make_tuple(std::move(result_src), std::move(result_dst));
-}
-
-template <typename vertex_t, typename weight_t>
-std::tuple<rmm::device_uvector<vertex_t>,
-           rmm::device_uvector<vertex_t>,
-           std::optional<rmm::device_uvector<weight_t>>>
-weighted_ktruss_subgraph_impl(raft::handle_t const& handle,
-                              raft::device_span<vertex_t> src,
-                              raft::device_span<vertex_t> dst,
-                              std::optional<raft::device_span<weight_t>> wgt,
-                              size_t number_of_vertices,
-                              int k)
-{
-  using HornetGraph = hornet::gpu::Hornet<vertex_t, hornet::EMPTY, hornet::TypeList<weight_t>>;
-  using UpdatePtr =
-    hornet::BatchUpdatePtr<vertex_t, hornet::TypeList<weight_t>, hornet::DeviceType::DEVICE>;
-  using Update = hornet::gpu::BatchUpdate<vertex_t, hornet::TypeList<weight_t>>;
-
-  HornetGraph hnt(number_of_vertices + 1);
-
-  UpdatePtr ptr(static_cast<int>(src.size()), src.data(), dst.data(), wgt->data());
-  Update batch(ptr);
-
-  hnt.insert(batch);
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to initialize graph");
-
-  KTrussWeighted<weight_t> kt(hnt);
-
-  kt.init();
-  kt.reset();
-  kt.createOffSetArray();
-  // NOTE : These parameters will become obsolete once we move to the updated
-  // algorithm (https://ieeexplore.ieee.org/document/8547581)
-  kt.setInitParameters(4,      // Number of threads per block per list intersection
-                       8,      // Number of intersections per block
-                       2,      // log2(Number of threads)
-                       64000,  // Total number of blocks launched
-                       32);    // Thread block dimension
-  kt.reset();
-  kt.sortHornet();
-
-  kt.runForK(k);
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run");
-
-  rmm::device_uvector<vertex_t> result_src(kt.getGraphEdgeCount(), handle.get_stream());
-  rmm::device_uvector<vertex_t> result_dst(kt.getGraphEdgeCount(), handle.get_stream());
-  std::optional<rmm::device_uvector<weight_t>> result_wgt{std::nullopt};
-
-  result_wgt = rmm::device_uvector<weight_t>(kt.getGraphEdgeCount(), handle.get_stream());
-  kt.copyGraph(result_src.data(), result_dst.data(), result_wgt->data());
-
-  kt.release();
-  CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to release");
-
-  return std::make_tuple(std::move(result_src), std::move(result_dst), std::move(result_wgt));
-}
-
-}  // namespace detail
-
-template <typename vertex_t, typename weight_t>
-std::tuple<rmm::device_uvector<vertex_t>,
-           rmm::device_uvector<vertex_t>,
-           std::optional<rmm::device_uvector<weight_t>>>
-k_truss_subgraph(raft::handle_t const& handle,
-                 raft::device_span<vertex_t> src,
-                 raft::device_span<vertex_t> dst,
-                 std::optional<raft::device_span<weight_t>> wgt,
-                 size_t number_of_vertices,
-                 int k)
-{
-  if (wgt.has_value()) {
-    return detail::weighted_ktruss_subgraph_impl(handle, src, dst, wgt, number_of_vertices, k);
-  } else {
-    auto [result_src, result_dst] =
-      detail::ktruss_subgraph_impl(handle, src, dst, number_of_vertices, k);
-    std::optional<rmm::device_uvector<weight_t>> result_wgt{std::nullopt};
-    return std::make_tuple(std::move(result_src), std::move(result_dst), std::move(result_wgt));
-  }
-}
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>>
-k_truss_subgraph(raft::handle_t const& handle,
-                 raft::device_span<int32_t> src,
-                 raft::device_span<int32_t> dst,
-                 std::optional<raft::device_span<float>> wgt,
-                 size_t number_of_vertices,
-                 int k);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>>
-k_truss_subgraph(raft::handle_t const& handle,
-                 raft::device_span<int32_t> src,
-                 raft::device_span<int32_t> dst,
-                 std::optional<raft::device_span<double>> wgt,
-                 size_t number_of_vertices,
-                 int k);
-
-}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f2eb4c38a2b..32e020ff6f9 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -414,6 +414,10 @@ ConfigureTest(CORE_NUMBER_TEST cores/core_number_test.cpp)
 # - Core Number tests -----------------------------------------------------------------------------
 ConfigureTest(K_CORE_TEST cores/k_core_test.cpp)
 
+###############################################################################################
+# - K-truss tests --------------------------------------------------------------------------
+ConfigureTest(K_TRUSS_TEST community/k_truss_test.cpp)
+
 ###################################################################################################
 # - Triangle Count tests --------------------------------------------------------------------------
 ConfigureTest(TRIANGLE_COUNT_TEST community/triangle_count_test.cpp)
@@ -769,7 +773,7 @@ ConfigureCTest(CAPI_INDUCED_SUBGRAPH_TEST c_api/induced_subgraph_test.c)
 ConfigureCTest(CAPI_DEGREES c_api/degrees_test.c)
 ConfigureCTest(CAPI_EGONET_TEST c_api/egonet_test.c)
 ConfigureCTest(CAPI_TWO_HOP_NEIGHBORS_TEST c_api/two_hop_neighbors_test.c)
-ConfigureCTest(CAPI_LEGACY_K_TRUSS_TEST c_api/legacy_k_truss_test.c)
+ConfigureCTest(CAPI_K_TRUSS_TEST c_api/k_truss_test.c)
 
 if (BUILD_CUGRAPH_MTMG_TESTS)
     ###################################################################################################
diff --git a/cpp/tests/c_api/legacy_k_truss_test.c b/cpp/tests/c_api/k_truss_test.c
similarity index 95%
rename from cpp/tests/c_api/legacy_k_truss_test.c
rename to cpp/tests/c_api/k_truss_test.c
index bc85f568688..89b2d6df544 100644
--- a/cpp/tests/c_api/legacy_k_truss_test.c
+++ b/cpp/tests/c_api/k_truss_test.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -120,7 +120,6 @@ int generic_k_truss_test(vertex_t* h_src,
       TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
     }
 
-
     ret_code = cugraph_type_erased_device_array_view_copy_to_host(
       resource_handle, (byte_t*)h_result_offsets, offsets, &ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
@@ -172,9 +171,9 @@ int test_k_truss()
   weight_t h_wgt[] = {
     0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
 
-  vertex_t h_result_src[]   = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
-  vertex_t h_result_dst[]   = {1, 2, 0, 2, 3, 0, 1, 3, 1, 2};
-  weight_t h_result_wgt[]   = {0.1, 5.1, 0.1, 3.1, 2.1, 5.1, 3.1, 4.1, 2.1, 4.1};
+  vertex_t h_result_src[]   = {1, 2, 2, 3, 3, 0, 0, 1, 1, 2};
+  vertex_t h_result_dst[]   = {0, 0, 1, 1, 2, 1, 2, 2, 3, 3};
+  weight_t h_result_wgt[]   = {0.1, 5.1, 3.1, 2.1, 4.1, 0.1, 5.1, 3.1, 2.1, 4.1};
   size_t h_result_offsets[] = {0, 10};
   size_t num_expected_edges = 10;
   size_t num_expected_offsets = 2;
@@ -203,8 +202,8 @@ int test_k_truss_no_weights()
   vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
   vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
 
-  vertex_t h_result_src[]   = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
-  vertex_t h_result_dst[]   = {1, 2, 0, 2, 3, 0, 1, 3, 1, 2};
+  vertex_t h_result_src[]   = {0, 0, 2, 2, 3, 1, 2, 1, 3, 1};
+  vertex_t h_result_dst[]   = {1, 2, 1, 3, 1, 0, 0, 2, 2, 3};
   size_t h_result_offsets[] = {0, 10};
   size_t num_expected_edges = 10;
   size_t num_expected_offsets = 2;
diff --git a/cpp/tests/community/k_truss_test.cpp b/cpp/tests/community/k_truss_test.cpp
new file mode 100644
index 00000000000..c8010422e42
--- /dev/null
+++ b/cpp/tests/community/k_truss_test.cpp
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include "utilities/base_fixture.hpp"
+#include "utilities/check_utilities.hpp"
+#include "utilities/conversion_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <set>
+#include <vector>
+
+struct KTruss_Usecase {
+  int32_t k_{3};
+  bool test_weighted_{false};
+  bool check_correctness_{true};
+};
+
+template <typename input_usecase_t>
+class Tests_KTruss : public ::testing::TestWithParam<std::tuple<KTruss_Usecase, input_usecase_t>> {
+ public:
+  Tests_KTruss() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // FIXME: There is an utility equivalent functor not
+  // supporting host vectors.
+  template <typename type_t>
+  struct host_nearly_equal {
+    const type_t threshold_ratio;
+    const type_t threshold_magnitude;
+
+    bool operator()(type_t lhs, type_t rhs) const
+    {
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+    }
+  };
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  std::tuple<std::vector<vertex_t>, std::vector<vertex_t>, std::optional<std::vector<weight_t>>>
+  k_truss_reference(std::vector<vertex_t> h_offsets,
+                    std::vector<vertex_t> h_indices,
+                    std::optional<std::vector<weight_t>> h_values,
+                    edge_t k)
+  {
+    std::vector<vertex_t> vertices(h_offsets.size() - 1);
+    std::iota(vertices.begin(), vertices.end(), 0);
+
+    auto n_dropped = 1;
+
+    while (n_dropped > 0) {
+      n_dropped = 0;
+      std::set<vertex_t> seen;
+      // Go over all the vertices.
+      for (auto u = vertices.begin(); u != vertices.end(); ++u) {
+        std::set<vertex_t> nbrs_u;
+        // Find all neighbors of u from the offsets and indices array
+        auto idx_start = (h_offsets.begin() + (*u));
+        auto idx_end   = idx_start + 1;
+
+        for (edge_t i = *idx_start; i < *idx_end; ++i) {
+          nbrs_u.insert(*(h_indices.begin() + i));
+        }
+
+        seen.insert(*u);
+        std::set<vertex_t> new_nbrs;
+        std::set_difference(nbrs_u.begin(),
+                            nbrs_u.end(),
+                            seen.begin(),
+                            seen.end(),
+                            std::inserter(new_nbrs, new_nbrs.end()));
+
+        // Finding the neighbors of v
+        for (auto v = new_nbrs.begin(); v != new_nbrs.end(); ++v) {
+          std::set<vertex_t> nbrs_v;
+          // Find all neighbors of v from the offsets and indices array
+          idx_start = (h_offsets.begin() + (*v));
+          idx_end   = idx_start + 1;
+          for (edge_t i = *idx_start; i < *idx_end; ++i) {
+            nbrs_v.insert(*(h_indices.begin() + i));
+          }
+
+          std::set<vertex_t> nbr_intersection_u_v;
+          // Find the intersection of nbr_u and nbr_v
+          std::set_intersection(nbrs_u.begin(),
+                                nbrs_u.end(),
+                                nbrs_v.begin(),
+                                nbrs_v.end(),
+                                std::inserter(nbr_intersection_u_v, nbr_intersection_u_v.end()));
+
+          if (nbr_intersection_u_v.size() < (k - 2)) {
+            auto del_v = std::find(
+              h_indices.begin() + h_offsets[*u], h_indices.begin() + h_offsets[*u + 1], *v);
+
+            if (h_values) {
+              (*h_values).erase((*h_values).begin() + std::distance(h_indices.begin(), del_v));
+            }
+
+            std::transform(std::begin(h_offsets) + (*u) + 1,
+                           std::end(h_offsets),
+                           std::begin(h_offsets) + (*u) + 1,
+                           [](int x) { return x - 1; });
+            h_indices.erase(del_v);
+
+            // Delete edge in both directions
+            auto del_u = std::find(
+              h_indices.begin() + h_offsets[*v], h_indices.begin() + h_offsets[*v + 1], *u);
+
+            if (h_values) {
+              (*h_values).erase((*h_values).begin() + std::distance(h_indices.begin(), del_u));
+            }
+            std::transform(std::begin(h_offsets) + (*v) + 1,
+                           std::end(h_offsets),
+                           std::begin(h_offsets) + (*v) + 1,
+                           [](int x) { return x - 1; });
+            h_indices.erase(del_u);
+            n_dropped += 1;
+          }
+        }
+      }
+    }
+
+    std::vector<vertex_t> h_srcs(h_indices.size());
+
+    for (auto i = 0; i < h_offsets.size() - 1; ++i) {
+      std::fill(h_srcs.begin() + h_offsets[i], h_srcs.begin() + h_offsets[i + 1], i);
+    }
+
+    return std::make_tuple(std::move(h_srcs), std::move(h_indices), std::move(h_values));
+  }
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<KTruss_Usecase const&, input_usecase_t const&> const& param)
+  {
+    constexpr bool renumber               = false;
+    auto [k_truss_usecase, input_usecase] = param;
+    raft::handle_t handle{};
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("SG Construct graph");
+    }
+
+    // NX k_truss is not implemented for graph with self loop and multi edges therefore dropped
+    // them especially for rmat generated graphs.
+    auto [graph, edge_weight, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, k_truss_usecase.test_weighted_, renumber, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("K-truss");
+    }
+
+    auto [d_cugraph_srcs, d_cugraph_dsts, d_cugraph_wgts] =
+      cugraph::k_truss<vertex_t, edge_t, weight_t, false>(
+        handle,
+        graph_view,
+        edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt,
+        k_truss_usecase.k_,
+        false);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (k_truss_usecase.check_correctness_) {
+      std::optional<cugraph::graph_t<vertex_t, edge_t, false, false>> modified_graph{std::nullopt};
+      auto [h_offsets, h_indices, h_values] = cugraph::test::graph_to_host_csr(
+        handle,
+        graph_view,
+        edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt,
+        std::optional<raft::device_span<vertex_t const>>(std::nullopt));
+
+      rmm::device_uvector<weight_t> d_sorted_cugraph_wgts{0, handle.get_stream()};
+      rmm::device_uvector<vertex_t> d_sorted_cugraph_srcs{0, handle.get_stream()};
+      rmm::device_uvector<vertex_t> d_sorted_cugraph_dsts{0, handle.get_stream()};
+
+      if (edge_weight) {
+        std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts, d_sorted_cugraph_wgts) =
+          cugraph::test::sort_by_key(handle, d_cugraph_srcs, d_cugraph_dsts, *d_cugraph_wgts);
+      } else {
+        std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts) =
+          cugraph::test::sort(handle, d_cugraph_srcs, d_cugraph_dsts);
+      }
+
+      auto h_cugraph_srcs = cugraph::test::to_host(handle, d_sorted_cugraph_srcs);
+
+      auto h_cugraph_dsts = cugraph::test::to_host(handle, d_sorted_cugraph_dsts);
+
+      auto [h_reference_srcs, h_reference_dsts, h_reference_wgts] =
+        k_truss_reference<vertex_t, edge_t, weight_t>(
+          h_offsets, h_indices, h_values, k_truss_usecase.k_);
+
+      EXPECT_EQ(h_cugraph_srcs.size(), h_reference_srcs.size());
+      ASSERT_TRUE(
+        std::equal(h_cugraph_srcs.begin(), h_cugraph_srcs.end(), h_reference_srcs.begin()));
+
+      ASSERT_TRUE(
+        std::equal(h_cugraph_dsts.begin(), h_cugraph_dsts.end(), h_reference_dsts.begin()));
+
+      if (edge_weight) {
+        auto h_cugraph_wgts  = cugraph::test::to_host(handle, d_sorted_cugraph_wgts);
+        auto compare_functor = host_nearly_equal<weight_t>{
+          weight_t{1e-3},
+          weight_t{(weight_t{1} / static_cast<weight_t>((h_cugraph_wgts).size())) *
+                   weight_t{1e-3}}};
+        EXPECT_TRUE(std::equal((h_cugraph_wgts).begin(),
+                               (h_cugraph_wgts).end(),
+                               (*h_reference_wgts).begin(),
+                               compare_functor));
+      }
+    }
+  }
+};
+
+using Tests_KTruss_File = Tests_KTruss<cugraph::test::File_Usecase>;
+using Tests_KTruss_Rmat = Tests_KTruss<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_KTruss_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_KTruss_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_KTruss_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_KTruss_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_KTruss_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(KTruss_Usecase{5, true, false},
+                      KTruss_Usecase{4, true, false},
+                      KTruss_Usecase{9, true, true},
+                      KTruss_Usecase{7, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/netscience.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_KTruss_Rmat,
+                         // enable correctness checks
+                         ::testing::Combine(::testing::Values(KTruss_Usecase{5, false, true},
+                                                              KTruss_Usecase{4, false, true},
+                                                              KTruss_Usecase{9, true, true},
+                                                              KTruss_Usecase{7, true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_KTruss_Rmat,
+  // disable correctness checks for large graphs
+  // FIXME: High memory footprint. Perform nbr_intersection in chunks.
+  ::testing::Combine(
+    ::testing::Values(KTruss_Usecase{12, false, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(14, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu
index 7d485dd5ab3..93bb8a04e87 100644
--- a/cpp/tests/utilities/thrust_wrapper.cu
+++ b/cpp/tests/utilities/thrust_wrapper.cu
@@ -57,12 +57,53 @@ value_buffer_type sort(raft::handle_t const& handle, value_buffer_type const& va
   return sorted_values;
 }
 
+template <typename value_buffer_type>
+std::tuple<value_buffer_type, value_buffer_type> sort(raft::handle_t const& handle,
+                                                      value_buffer_type const& first,
+                                                      value_buffer_type const& second)
+{
+  auto sorted_first =
+    cugraph::allocate_dataframe_buffer<cugraph::dataframe_element_t<value_buffer_type>>(
+      first.size(), handle.get_stream());
+  auto sorted_second =
+    cugraph::allocate_dataframe_buffer<cugraph::dataframe_element_t<value_buffer_type>>(
+      first.size(), handle.get_stream());
+
+  auto execution_policy = handle.get_thrust_policy();
+  thrust::copy(execution_policy,
+               cugraph::get_dataframe_buffer_begin(first),
+               cugraph::get_dataframe_buffer_end(first),
+               cugraph::get_dataframe_buffer_begin(sorted_first));
+  thrust::copy(execution_policy,
+               cugraph::get_dataframe_buffer_begin(second),
+               cugraph::get_dataframe_buffer_end(second),
+               cugraph::get_dataframe_buffer_begin(sorted_second));
+  thrust::sort(
+    execution_policy,
+    thrust::make_zip_iterator(cugraph::get_dataframe_buffer_begin(sorted_first),
+                              cugraph::get_dataframe_buffer_begin(sorted_second)),
+    thrust::make_zip_iterator(cugraph::get_dataframe_buffer_begin(sorted_first) + first.size(),
+                              cugraph::get_dataframe_buffer_begin(sorted_second) + first.size()));
+
+  return std::make_tuple(std::move(sorted_first), std::move(sorted_second));
+}
+
 template rmm::device_uvector<int32_t> sort(raft::handle_t const& handle,
                                            rmm::device_uvector<int32_t> const& values);
 
 template rmm::device_uvector<int64_t> sort(raft::handle_t const& handle,
                                            rmm::device_uvector<int64_t> const& values);
 
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>> sort(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int32_t> const& first,
+  rmm::device_uvector<int32_t> const& second);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>> sort(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int64_t> const& first,
+  rmm::device_uvector<int64_t> const& second);
+
 template <typename key_buffer_type, typename value_buffer_type>
 std::tuple<key_buffer_type, value_buffer_type> sort_by_key(raft::handle_t const& handle,
                                                            key_buffer_type const& keys,
@@ -143,6 +184,76 @@ template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
   rmm::device_uvector<int64_t> const& keys,
   rmm::device_uvector<int64_t> const& values);
 
+template <typename key_buffer_type, typename value_buffer_type>
+std::tuple<key_buffer_type, key_buffer_type, value_buffer_type> sort_by_key(
+  raft::handle_t const& handle,
+  key_buffer_type const& first,
+  key_buffer_type const& second,
+  value_buffer_type const& values)
+{
+  auto sorted_first =
+    cugraph::allocate_dataframe_buffer<cugraph::dataframe_element_t<key_buffer_type>>(
+      first.size(), handle.get_stream());
+  auto sorted_second =
+    cugraph::allocate_dataframe_buffer<cugraph::dataframe_element_t<key_buffer_type>>(
+      first.size(), handle.get_stream());
+  auto sorted_values =
+    cugraph::allocate_dataframe_buffer<cugraph::dataframe_element_t<value_buffer_type>>(
+      first.size(), handle.get_stream());
+
+  auto execution_policy = handle.get_thrust_policy();
+  thrust::copy(execution_policy,
+               cugraph::get_dataframe_buffer_begin(first),
+               cugraph::get_dataframe_buffer_end(first),
+               cugraph::get_dataframe_buffer_begin(sorted_first));
+  thrust::copy(execution_policy,
+               cugraph::get_dataframe_buffer_begin(second),
+               cugraph::get_dataframe_buffer_end(second),
+               cugraph::get_dataframe_buffer_begin(sorted_second));
+  thrust::copy(execution_policy,
+               cugraph::get_dataframe_buffer_begin(values),
+               cugraph::get_dataframe_buffer_end(values),
+               cugraph::get_dataframe_buffer_begin(sorted_values));
+  thrust::sort_by_key(
+    execution_policy,
+    thrust::make_zip_iterator(cugraph::get_dataframe_buffer_begin(sorted_first),
+                              cugraph::get_dataframe_buffer_begin(sorted_second)),
+    thrust::make_zip_iterator(cugraph::get_dataframe_buffer_begin(sorted_first) + first.size(),
+                              cugraph::get_dataframe_buffer_begin(sorted_second) + first.size()),
+    cugraph::get_dataframe_buffer_begin(sorted_values));
+
+  return std::make_tuple(
+    std::move(sorted_first), std::move(sorted_second), std::move(sorted_values));
+}
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
+  sort_by_key(raft::handle_t const& handle,
+              rmm::device_uvector<int32_t> const& first,
+              rmm::device_uvector<int32_t> const& second,
+              rmm::device_uvector<float> const& values);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
+  sort_by_key(raft::handle_t const& handle,
+              rmm::device_uvector<int64_t> const& first,
+              rmm::device_uvector<int64_t> const& second,
+              rmm::device_uvector<float> const& values);
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<double>>
+  sort_by_key(raft::handle_t const& handle,
+              rmm::device_uvector<int32_t> const& first,
+              rmm::device_uvector<int32_t> const& second,
+              rmm::device_uvector<double> const& values);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>, rmm::device_uvector<double>>
+  sort_by_key(raft::handle_t const& handle,
+              rmm::device_uvector<int64_t> const& first,
+              rmm::device_uvector<int64_t> const& second,
+              rmm::device_uvector<double> const& values);
+
 template std::tuple<rmm::device_uvector<int32_t>,
                     std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>>
 sort_by_key(raft::handle_t const& handle,
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index ae2f5f2fdf7..c4b87126f50 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -29,11 +29,23 @@ namespace test {
 template <typename value_buffer_type>
 value_buffer_type sort(raft::handle_t const& handle, value_buffer_type const& values);
 
+template <typename value_buffer_type>
+std::tuple<value_buffer_type, value_buffer_type> sort(raft::handle_t const& handle,
+                                                      value_buffer_type const& first,
+                                                      value_buffer_type const& second);
+
 template <typename key_buffer_type, typename value_buffer_type>
 std::tuple<key_buffer_type, value_buffer_type> sort_by_key(raft::handle_t const& handle,
                                                            key_buffer_type const& keys,
                                                            value_buffer_type const& values);
 
+template <typename key_buffer_type, typename value_buffer_type>
+std::tuple<key_buffer_type, key_buffer_type, value_buffer_type> sort_by_key(
+  raft::handle_t const& handle,
+  key_buffer_type const& first,
+  key_buffer_type const& second,
+  value_buffer_type const& values);
+
 template <typename vertex_t>
 vertex_t max_element(raft::handle_t const& handle, raft::device_span<vertex_t const> vertices);