diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index bab521f485d..7c9cd0258a4 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index d225f15f755..a4dc168505b 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -8,10 +8,15 @@ "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { - "version": "1.14.1" + "version": "1.15.0" }, "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { "version": "11.8", diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index bcaabab572b..eae4967f3b2 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index e472f4621f9..393a5c63d23 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -8,10 +8,15 @@ "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { - "version": "1.14.1" + "version": "1.15.0" }, "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { "version": "12.2", diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9a7324fb330..f5c14e8d315 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -105,6 +105,7 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done sed_runner "s/:[0-9][0-9]\.[0-9][0-9]/:${NEXT_SHORT_TAG}/" ./notebooks/README.md diff --git a/ci/test_wheel_cugraph-pyg.sh 
b/ci/test_wheel_cugraph-pyg.sh index e98bf4ab56b..f45112dd80b 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -33,7 +33,7 @@ else fi rapids-logger "Installing PyTorch and PyG dependencies" rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} -rapids-retry python -m pip install torch-geometric==2.4.0 +rapids-retry python -m pip install "torch-geometric>=2.5,<2.6" rapids-retry python -m pip install \ ogb \ pyg_lib \ diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 818616c2f5b..c02e8391eb2 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,7 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - pyg >=2.3,<2.5 + - pyg >=2.5,<2.6 tests: imports: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eb6f348b380..57e0aa2d078 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -77,6 +77,16 @@ rapids_find_package(CUDAToolkit REQUIRED INSTALL_EXPORT_SET cugraph-exports ) +if (BUILD_CUGRAPH_MTMG_TESTS) + if(NOT TARGET ucx::ucp) + find_package(ucx REQUIRED) + endif() + + if(NOT TARGET ucxx::ucxx) + find_package(ucxx REQUIRED) + endif() +endif() + set(CUGRAPH_C_FLAGS "") set(CUGRAPH_CXX_FLAGS "") set(CUGRAPH_CUDA_FLAGS "") @@ -145,11 +155,6 @@ if(USE_CUGRAPH_OPS) include(cmake/thirdparty/get_libcugraphops.cmake) endif() - -if (BUILD_CUGRAPH_MTMG_TESTS) - include(cmake/thirdparty/get_ucp.cmake) -endif() - if(BUILD_TESTS) include(${rapids-cmake-dir}/cpm/gtest.cmake) rapids_cpm_gtest(BUILD_STATIC) @@ -283,9 +288,12 @@ set(CUGRAPH_SOURCES src/structure/symmetrize_edgelist_mg.cu src/community/triangle_count_sg.cu src/community/triangle_count_mg.cu + src/community/approx_weighted_matching_sg.cu + src/community/approx_weighted_matching_mg.cu src/traversal/k_hop_nbrs_sg.cu src/traversal/k_hop_nbrs_mg.cu src/mtmg/vertex_result.cu + src/mtmg/vertex_pairs_result.cu ) if(USE_CUGRAPH_OPS) diff --git a/cpp/cmake/thirdparty/get_ucp.cmake b/cpp/cmake/thirdparty/get_ucp.cmake deleted file mode 100644 index dcc4956a34e..00000000000 --- a/cpp/cmake/thirdparty/get_ucp.cmake +++ /dev/null @@ -1,35 +0,0 @@ -#============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#=============================================================================
-
-function(find_and_configure_ucp)
-
-    if(TARGET UCP::UCP)
-        return()
-    endif()
-
-    rapids_find_generate_module(UCP
-        HEADER_NAMES ucp.h
-        LIBRARY_NAMES ucp
-        INCLUDE_SUFFIXES ucp/api
-        )
-
-    # Currently UCP has no CMake build-system so we require
-    # it built and installed on the machine already
-    rapids_find_package(UCP REQUIRED)
-
-endfunction()
-
-find_and_configure_ucp()
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 0caa151daac..7c4a978c4b4 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -2368,6 +2368,32 @@ rmm::device_uvector vertex_coloring(
   graph_view_t const& graph_view,
   raft::random::RngState& rng_state);
 
+/*
+ * @brief Approximate Weighted Matching
+ *
+ * A matching in an undirected graph G = (V, E) is a pairing of adjacent vertices
+ * such that each vertex is matched with at most one other vertex, the objective
+ * being to match as many vertices as possible or to maximise the sum of the
+ * weights of the matched edges. Here we provide an implementation of an
+ * approximation algorithm for maximum weight matching. See
+ * https://web.archive.org/web/20081031230449id_/http://www.ii.uib.no/~fredrikm/fredrik/papers/CP75.pdf
+ * for further information.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) or multi-GPU (true).
+ * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in] graph_view Graph view object.
+ * @param[in] edge_weight_view View object holding edge weights for @p graph_view.
+ * @return A tuple of device vector of matched vertex ids and sum of the weights of the matched
+ * edges.
+ */
+template
+std::tuple, weight_t> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  graph_view_t const& graph_view,
+  edge_property_view_t edge_weight_view);
 } // namespace cugraph
 
 /**
diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
index 1353302f825..69d48098a5d 100644
--- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp
+++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -213,12 +213,12 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
 *
 * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
 * and handles to various CUDA libraries) to run graph algorithms.
- * @param[in/out] d_edgelist_majors Vertex IDs for sources (if we are internally storing edges in + * @param[in,out] d_edgelist_majors Vertex IDs for sources (if we are internally storing edges in * the sparse 2D matrix using sources as major indices) or destinations (otherwise) - * @param[in/out] d_edgelist_minors Vertex IDs for destinations (if we are internally storing edges + * @param[in,out] d_edgelist_minors Vertex IDs for destinations (if we are internally storing edges * in the sparse 2D matrix using sources as major indices) or sources (otherwise) - * @param[in/out] d_edgelist_weights Optional edge weights - * @param[in/out] d_edgelist_id_type_pairs Optional edge (ID, type) pairs + * @param[in,out] d_edgelist_weights Optional edge weights + * @param[in,out] d_edgelist_id_type_pairs Optional edge (ID, type) pairs * @param[in] groupby_and_count_local_partition_by_minor If set to true, groupby and count edges * based on (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the * compute_gpu_id_from_vertex_t function to the minor vertex ID). If set to false, groupby and count diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp index caba9e91d8b..82a7a3fae25 100644 --- a/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp @@ -25,12 +25,7 @@ namespace mtmg { namespace detail { /** - * @brief Wrap an object to be available for each GPU - * - * In the MTMG environment we need the ability to manage a collection of objects - * that are associated with a particular GPU, and fetch the objects from an - * arbitrary GPU thread. This object will wrap any object and allow it to be - * accessed from different threads. + * @brief Manage device spans on each GPU */ template using device_shared_device_span_t = device_shared_wrapper_t>; diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_span_tuple.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_span_tuple.hpp new file mode 100644 index 00000000000..2d28cbadc24 --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_span_tuple.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Manage a tuple of device spans on each GPU + */ +template +using device_shared_device_span_tuple_t = + device_shared_wrapper_t...>>; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp index 969d32aa313..cfb746551ef 100644 --- a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp @@ -25,12 +25,10 @@ namespace mtmg { namespace detail { /** - * @brief Wrap an object to be available for each GPU + * @brief Manage a device vector on each GPU * - * In the MTMG environment we need the ability to manage a collection of objects - * that are associated with a particular GPU, and fetch the objects from an - * arbitrary GPU thread. This object will wrap any object and allow it to be - * accessed from different threads. + * Uses the device_shared_wrapper to manage an rmm::device_uvector on + * each GPU. */ template class device_shared_device_vector_t : public device_shared_wrapper_t> { diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector_tuple.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector_tuple.hpp new file mode 100644 index 00000000000..8d93d1ac39a --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector_tuple.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Manage a tuple of device vector on each GPU + * + * Uses the device_shared_wrapper to manage a tuple of rmm::device_uvector + * instances on each GPU. + */ +template +class device_shared_device_vector_tuple_t + : public device_shared_wrapper_t...>> { + using parent_t = detail::device_shared_wrapper_t...>>; + + public: + /** + * @brief Create a device_shared_device_span (read only view) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + device_shared_device_span_tuple_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result, this](auto& p) { + convert_to_span(std::index_sequence_for(), result, p); + // std::size_t Is... 
= std::index_sequence_for; + // result.set(p.first, std::make_tuple(raft::device_span{std::get(p.second).data(), std::get(p.second).size()}...)); + }); + + return result; + } + + private: + template + void convert_to_span(std::index_sequence, + device_shared_device_span_tuple_t& result, + std::pair...>>& p) + { + result.set(p.first, + std::make_tuple(raft::device_span{std::get(p.second).data(), + std::get(p.second).size()}...)); + } +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_pair_result.hpp b/cpp/include/cugraph/mtmg/vertex_pair_result.hpp new file mode 100644 index 00000000000..2049b76c4b9 --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_pair_result.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing vertex results + */ +template +class vertex_pair_result_t + : public detail::device_shared_device_vector_tuple_t { + using parent_t = detail::device_shared_device_vector_tuple_t; + + public: + /** + * @brief Create a vertex result view (read only) + */ + auto view() { return vertex_pair_result_view_t(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_pair_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_pair_result_view.hpp new file mode 100644 index 00000000000..8e51ef2de3c --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_pair_result_view.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for referencing a vertex pair result + */ +template +class vertex_pair_result_view_t + : public detail::device_shared_device_span_tuple_t { + using parent_t = detail::device_shared_device_span_tuple_t; + + public: + vertex_pair_result_view_t(parent_t&& other) : parent_t{std::move(other)} {} + + /** + * @brief Gather results from specified vertices + */ + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + gather(handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + cugraph::vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph_c/community_algorithms.h b/cpp/include/cugraph_c/community_algorithms.h index cb3d6b6375a..b6f59333805 100644 --- a/cpp/include/cugraph_c/community_algorithms.h +++ b/cpp/include/cugraph_c/community_algorithms.h @@ -121,7 +121,7 @@ cugraph_error_code_t cugraph_louvain(const cugraph_resource_handle_t* handle, * @param [in] handle Handle for accessing resources * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] max_level Maximum level in hierarchy * @param [in] resolution Resolution parameter (gamma) in modularity formula. * This changes the size of the communities. Higher resolutions @@ -181,7 +181,7 @@ void cugraph_hierarchical_clustering_result_free(cugraph_hierarchical_clustering * @brief Compute ECG clustering * * @param [in] handle Handle for accessing resources - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] min_weight Minimum edge weight in final graph diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h index 94b06189796..ff7e439232a 100644 --- a/cpp/include/cugraph_c/graph_functions.h +++ b/cpp/include/cugraph_c/graph_functions.h @@ -229,6 +229,26 @@ cugraph_error_code_t cugraph_allgather(const cugraph_resource_handle_t* handle, cugraph_induced_subgraph_result_t** result, cugraph_error_t** error); +/** + * @brief Count multi_edges + * + * Count the number of multi-edges in the graph + * + * @param [in] handle Handle for accessing resources. + * @param [in] graph Pointer to graph + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * true) + * @param [out] result Where to store the count of multi-edges + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_count_multi_edges(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + size_t* result, + cugraph_error_t** error); + /** * @brief Opaque degree result type */ diff --git a/cpp/include/cugraph_c/graph_generators.h b/cpp/include/cugraph_c/graph_generators.h index 36cded27ba6..272131d2aab 100644 --- a/cpp/include/cugraph_c/graph_generators.h +++ b/cpp/include/cugraph_c/graph_generators.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -120,7 +120,7 @@ void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); * Vertex types will be int32 if scale < 32 and int64 if scale >= 32 * * @param [in] handle Handle for accessing resources - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] scale Scale factor to set the number of vertices in the graph. Vertex IDs have * values in [0, V), where V = 1 << @p scale. * @param [in] num_edges Number of edges to generate. @@ -164,7 +164,7 @@ cugraph_error_code_t cugraph_generate_rmat_edgelist(const cugraph_resource_handl * Vertex types will be int32 if scale < 32 and int64 if scale >= 32 * * @param [in] handle Handle for accessing resources - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] n_edgelists Number of edge lists (graphs) to generate * @param [in] min_scale Scale factor to set the minimum number of verties in the graph. * @param [in] max_scale Scale factor to set the maximum number of verties in the graph. @@ -204,8 +204,8 @@ cugraph_error_code_t cugraph_generate_rmat_edgelists( * Updates a COO to contain random edge weights * * @param [in] handle Handle for accessing resources - * @param [in/out] rng_state State of the random number generator, updated with each call - * @param [in/out] coo Opaque pointer to the coo, weights will be added (overwriting + * @param [in,out] rng_state State of the random number generator, updated with each call + * @param [in,out] coo Opaque pointer to the coo, weights will be added (overwriting * any existing weights) * @param [in] dtype The type of weight to generate (FLOAT32 or FLOAT64), ignored * unless include_weights is true @@ -229,7 +229,7 @@ cugraph_error_code_t cugraph_generate_edge_weights(const cugraph_resource_handle * edges * * @param [in] handle Handle for accessing resources - * @param [in/out] coo Opaque pointer to the coo, weights will be added (overwriting + * @param [in,out] coo Opaque pointer to the coo, weights will be added (overwriting * any existing weights) * @param [in] multi_gpu Flag if the COO is being created on multiple GPUs * @param [out] error Pointer to an error object storing details of any error. Will @@ -246,8 +246,8 @@ cugraph_error_code_t cugraph_generate_edge_ids(const cugraph_resource_handle_t* * Updates a COO to contain edge types. Edges types will be randomly generated. 
* * @param [in] handle Handle for accessing resources - * @param [in/out] rng_state State of the random number generator, updated with each call - * @param [in/out] coo Opaque pointer to the coo, weights will be added (overwriting + * @param [in,out] rng_state State of the random number generator, updated with each call + * @param [in,out] coo Opaque pointer to the coo, weights will be added (overwriting * any existing weights) * @param [in] max_edge_type Edge types will be randomly generated between min_edge_type * and max_edge_type diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 859eaca7f3b..35f60e195be 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -348,7 +348,7 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options); * parameter is only used with the retain_seeds option. * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm. * We only support fanout values of type INT32 - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] sampling_options * Opaque pointer defining the sampling options. * @param [in] do_expensive_check @@ -599,7 +599,7 @@ cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_create( * * @param [in] handle Handle for accessing resources * @param [in] graph Pointer to graph - * @param [in/out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] num_vertices Number of vertices to sample * @param [out] vertices Device array view to populate label * @param [out] error Pointer to an error object storing details of diff --git a/cpp/include/cugraph_c/traversal_algorithms.h b/cpp/include/cugraph_c/traversal_algorithms.h index e25fa167e43..166205ce281 100644 --- a/cpp/include/cugraph_c/traversal_algorithms.h +++ b/cpp/include/cugraph_c/traversal_algorithms.h @@ -88,7 +88,7 @@ void cugraph_paths_result_free(cugraph_paths_result_t* result); * @param [in] handle Handle for accessing resources * @param [in] graph Pointer to graph * FIXME: Make this just [in], copy it if I need to temporarily modify internally - * @param [in/out] sources Array of source vertices. NOTE: Array might be modified if + * @param [in,out] sources Array of source vertices. 
NOTE: Array might be modified if * renumbering is enabled for the graph * @param [in] direction_optimizing If set to true, this algorithm switches between the push based * breadth-first search and pull based breadth-first search depending on the size of the diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp index 35f7086d726..91371b988b3 100644 --- a/cpp/src/c_api/graph_functions.cpp +++ b/cpp/src/c_api/graph_functions.cpp @@ -214,6 +214,44 @@ struct two_hop_neighbors_functor : public cugraph::c_api::abstract_functor { } }; +struct count_multi_edges_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_{}; + cugraph::c_api::cugraph_graph_t* graph_{nullptr}; + size_t result_{}; + bool do_expensive_check_{false}; + + count_multi_edges_functor(::cugraph_resource_handle_t const* handle, + ::cugraph_graph_t* graph, + bool do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + graph_(reinterpret_cast(graph)), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + auto graph = + reinterpret_cast*>( + graph_->graph_); + + auto graph_view = graph->view(); + + result_ = static_cast(graph_view.count_multi_edges(handle_)); + } + } +}; + } // namespace extern "C" cugraph_error_code_t cugraph_create_vertex_pairs( @@ -281,3 +319,14 @@ extern "C" cugraph_error_code_t cugraph_two_hop_neighbors( return cugraph::c_api::run_algorithm(graph, functor, result, error); } + +extern "C" cugraph_error_code_t cugraph_count_multi_edges(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + size_t* result, + cugraph_error_t** error) +{ + count_multi_edges_functor functor(handle, graph, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh new file mode 100644 index 00000000000..e693beee489 --- /dev/null +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "prims/fill_edge_property.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/transform_reduce_e_by_src_dst_key.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "utilities/collect_comm.cuh"
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cugraph {
+
+namespace detail {
+
+template
+std::tuple, weight_t> approximate_weighted_matching(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t const& graph_view,
+  edge_property_view_t edge_weight_view)
+{
+  CUGRAPH_EXPECTS(graph_view.is_symmetric(),
+                  "Invalid input arguments: input graph for approximate_weighted_matching must "
+                  "be symmetric");
+
+  using graph_view_t = cugraph::graph_view_t;
+
+  graph_view_t current_graph_view(graph_view);
+  if (current_graph_view.has_edge_mask()) { current_graph_view.clear_edge_mask(); }
+
+  cugraph::edge_property_t edge_masks_even(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+  cugraph::edge_property_t edge_masks_odd(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+
+  if (graph_view.has_edge_mask()) {
+    current_graph_view.attach_edge_mask(*(graph_view.edge_mask_view()));
+  }
+  // Mask out self-loops
+  cugraph::transform_e(
+    handle,
+    current_graph_view,
+    cugraph::edge_src_dummy_property_t{}.view(),
+    cugraph::edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+      return !(src == dst);
+    },
+    edge_masks_even.mutable_view());
+
+  if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+  current_graph_view.attach_edge_mask(edge_masks_even.view());
+
+  auto constexpr invalid_partner = invalid_vertex_id::value;
+  rmm::device_uvector offers_from_partners(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+  rmm::device_uvector partners(current_graph_view.local_vertex_partition_range_size(),
+                               handle.get_stream());
+
+  thrust::fill(handle.get_thrust_policy(), partners.begin(), partners.end(), invalid_partner);
+  thrust::fill(handle.get_thrust_policy(),
+               offers_from_partners.begin(),
+               offers_from_partners.end(),
+               weight_t{0.0});
+
+  rmm::device_uvector local_vertices(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+  detail::sequence_fill(handle.get_stream(),
+                        local_vertices.begin(),
+                        local_vertices.size(),
+                        current_graph_view.local_vertex_partition_range_first());
+
+  edge_src_property_t src_key_cache(handle);
+  cugraph::edge_src_property_t src_match_flags(handle);
+  cugraph::edge_dst_property_t dst_match_flags(handle);
+
+  if constexpr (graph_view_t::is_multi_gpu) {
+    src_key_cache = edge_src_property_t(handle, current_graph_view);
+
+    update_edge_src_property(handle, current_graph_view, local_vertices.begin(), src_key_cache);
+
+    src_match_flags = cugraph::edge_src_property_t(handle, current_graph_view);
+    dst_match_flags = cugraph::edge_dst_property_t(handle, current_graph_view);
+  }
+
+  vertex_t loop_counter = 0;
+  while (true) {
+    //
+    // For each candidate vertex, find the best possible target
+    //
+
+    rmm::device_uvector candidates(0, handle.get_stream());
+    rmm::device_uvector offers_from_candidates(0, handle.get_stream());
+    rmm::device_uvector targets(0, handle.get_stream());
+
+    // FIXME: This can be implemented more efficiently if
per_v_transform_reduce_incoming|outgoing_e + // is updated to support reduction on thrust::tuple. + std::forward_as_tuple(candidates, std::tie(offers_from_candidates, targets)) = + cugraph::transform_reduce_e_by_src_key( + handle, + current_graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_weight_view, + graph_view_t::is_multi_gpu + ? src_key_cache.view() + : detail::edge_major_property_view_t(local_vertices.begin()), + [] __device__(auto, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wt) { + return thrust::make_tuple(wt, dst); + }, + thrust::make_tuple(weight_t{0.0}, invalid_partner), + reduce_op::maximum>{}, + true); + + // + // For each target, find the best offer + // + + if constexpr (graph_view_t::is_multi_gpu) { + auto vertex_partition_range_lasts = current_graph_view.vertex_partition_range_lasts(); + + rmm::device_uvector d_vertex_partition_range_lasts( + vertex_partition_range_lasts.size(), handle.get_stream()); + + raft::update_device(d_vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), + handle.get_stream()); + + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto key_func = cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}; + + std::forward_as_tuple(std::tie(candidates, offers_from_candidates, targets), std::ignore) = + cugraph::groupby_gpu_id_and_shuffle_values( + handle.get_comms(), + thrust::make_zip_iterator(thrust::make_tuple( + candidates.begin(), offers_from_candidates.begin(), targets.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(candidates.end(), offers_from_candidates.end(), targets.end())), + [key_func] __device__(auto val) { return key_func(thrust::get<2>(val)); }, + handle.get_stream()); + } + + auto itr_to_tuples = thrust::make_zip_iterator( + thrust::make_tuple(offers_from_candidates.begin(), candidates.begin())); + + thrust::sort_by_key(handle.get_thrust_policy(), targets.begin(), targets.end(), itr_to_tuples); + + auto nr_unique_targets = thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(targets.size()), + is_first_in_run_t{targets.data()}); + + rmm::device_uvector unique_targets(nr_unique_targets, handle.get_stream()); + rmm::device_uvector best_offers_to_targets(nr_unique_targets, handle.get_stream()); + rmm::device_uvector best_candidates(nr_unique_targets, handle.get_stream()); + + auto itr_to_reduced_tuples = thrust::make_zip_iterator( + thrust::make_tuple(best_offers_to_targets.begin(), best_candidates.begin())); + + auto new_end = thrust::reduce_by_key( + handle.get_thrust_policy(), + targets.begin(), + targets.end(), + itr_to_tuples, + unique_targets.begin(), + itr_to_reduced_tuples, + thrust::equal_to{}, + [] __device__(auto pair1, auto pair2) { return (pair1 > pair2) ? 
pair1 : pair2; });
+
+    vertex_t nr_reduces_tuples =
+      static_cast(thrust::distance(unique_targets.begin(), new_end.first));
+
+    targets = std::move(unique_targets);
+    offers_from_candidates = std::move(best_offers_to_targets);
+    candidates = std::move(best_candidates);
+
+    //
+    // If two vertices offer to each other, that's a match
+    //
+
+    kv_store_t target_candidate_map(targets.begin(),
+                                    targets.end(),
+                                    candidates.begin(),
+                                    invalid_vertex_id::value,
+                                    invalid_vertex_id::value,
+                                    handle.get_stream());
+
+    rmm::device_uvector candidates_of_candidates(0, handle.get_stream());
+
+    if (graph_view_t::is_multi_gpu) {
+      auto& comm = handle.get_comms();
+      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+
+      auto partitions_range_lasts = graph_view.vertex_partition_range_lasts();
+      rmm::device_uvector d_partitions_range_lasts(partitions_range_lasts.size(),
+                                                   handle.get_stream());
+
+      raft::update_device(d_partitions_range_lasts.data(),
+                          partitions_range_lasts.data(),
+                          partitions_range_lasts.size(),
+                          handle.get_stream());
+
+      cugraph::detail::compute_gpu_id_from_int_vertex_t vertex_to_gpu_id_op{
+        raft::device_span(d_partitions_range_lasts.data(),
+                          d_partitions_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size};
+
+      candidates_of_candidates = cugraph::collect_values_for_keys(handle,
+                                                                  target_candidate_map.view(),
+                                                                  candidates.begin(),
+                                                                  candidates.end(),
+                                                                  vertex_to_gpu_id_op);
+    } else {
+      candidates_of_candidates.resize(candidates.size(), handle.get_stream());
+
+      target_candidate_map.view().find(candidates.begin(),
+                                       candidates.end(),
+                                       candidates_of_candidates.begin(),
+                                       handle.get_stream());
+    }
+
+    //
+    // Mask out neighborhood of matched vertices
+    //
+
+    rmm::device_uvector is_vertex_matched = rmm::device_uvector(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::fill(
+      handle.get_thrust_policy(), is_vertex_matched.begin(), is_vertex_matched.end(), bool{false});
+
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(thrust::make_tuple(candidates_of_candidates.begin(),
+                                                   targets.begin(),
+                                                   candidates.begin(),
+                                                   offers_from_candidates.begin())),
+      thrust::make_zip_iterator(thrust::make_tuple(candidates_of_candidates.end(),
+                                                   targets.end(),
+                                                   candidates.end(),
+                                                   offers_from_candidates.end())),
+      [partners = partners.begin(),
+       offers_from_partners = offers_from_partners.begin(),
+       is_vertex_matched =
+         raft::device_span(is_vertex_matched.data(), is_vertex_matched.size()),
+       v_first =
+         current_graph_view.local_vertex_partition_range_first()] __device__(auto msrc_tgt) {
+        auto candidate_of_candidate = thrust::get<0>(msrc_tgt);
+        auto tgt = thrust::get<1>(msrc_tgt);
+        auto candidate = thrust::get<2>(msrc_tgt);
+        auto offer_value = thrust::get<3>(msrc_tgt);
+
+        if (candidate_of_candidate != invalid_partner && candidate_of_candidate == tgt) {
+          auto tgt_offset = tgt - v_first;
+          is_vertex_matched[tgt_offset] = true;
+          partners[tgt_offset] = candidate;
+          offers_from_partners[tgt_offset] = offer_value;
+        }
+      });
+
+    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
+
+    if constexpr (graph_view_t::is_multi_gpu) {
+      cugraph::update_edge_src_property(
+        handle, current_graph_view, is_vertex_matched.begin(), src_match_flags);
+      cugraph::update_edge_dst_property(
+        handle,
current_graph_view, is_vertex_matched.begin(), dst_match_flags); + } + + if (loop_counter % 2 == 0) { + if constexpr (graph_view_t::is_multi_gpu) { + cugraph::transform_e( + handle, + current_graph_view, + src_match_flags.view(), + dst_match_flags.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + return !((is_src_matched == true) || (is_dst_matched == true)); + }, + edge_masks_odd.mutable_view()); + } else { + cugraph::transform_e( + handle, + current_graph_view, + detail::edge_major_property_view_t(is_vertex_matched.begin()), + detail::edge_minor_property_view_t(is_vertex_matched.begin(), + vertex_t{0}), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + return !((is_src_matched == true) || (is_dst_matched == true)); + }, + edge_masks_odd.mutable_view()); + } + + if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); + cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even); + current_graph_view.attach_edge_mask(edge_masks_odd.view()); + } else { + if constexpr (graph_view_t::is_multi_gpu) { + cugraph::transform_e( + handle, + current_graph_view, + src_match_flags.view(), + dst_match_flags.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + return !((is_src_matched == true) || (is_dst_matched == true)); + }, + edge_masks_even.mutable_view()); + } else { + cugraph::transform_e( + handle, + current_graph_view, + detail::edge_major_property_view_t(is_vertex_matched.begin()), + detail::edge_minor_property_view_t(is_vertex_matched.begin(), + vertex_t{0}), + cugraph::edge_dummy_property_t{}.view(), + [] __device__( + auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t) { + return !((is_src_matched == true) || (is_dst_matched == true)); + }, + edge_masks_even.mutable_view()); + } + + if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask(); + cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd); + current_graph_view.attach_edge_mask(edge_masks_even.view()); + } + + loop_counter++; + } + + weight_t sum_matched_edge_weights = thrust::reduce( + handle.get_thrust_policy(), offers_from_partners.begin(), offers_from_partners.end()); + + if constexpr (graph_view_t::is_multi_gpu) { + sum_matched_edge_weights = host_scalar_allreduce( + handle.get_comms(), sum_matched_edge_weights, raft::comms::op_t::SUM, handle.get_stream()); + } + + return std::make_tuple(std::move(partners), sum_matched_edge_weights / 2.0); +} +} // namespace detail + +template +std::tuple, weight_t> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view) +{ + return detail::approximate_weighted_matching(handle, graph_view, edge_weight_view); +} + +} // namespace cugraph diff --git a/cpp/src/community/approx_weighted_matching_mg.cu b/cpp/src/community/approx_weighted_matching_mg.cu new file mode 100644 index 00000000000..41d6c3d97e0 --- /dev/null +++ b/cpp/src/community/approx_weighted_matching_mg.cu @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "approx_weighted_matching_impl.cuh" + +namespace cugraph { + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +} // namespace cugraph diff --git a/cpp/src/community/approx_weighted_matching_sg.cu b/cpp/src/community/approx_weighted_matching_sg.cu new file mode 100644 index 00000000000..418a43d51ae --- /dev/null +++ b/cpp/src/community/approx_weighted_matching_sg.cu @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "approx_weighted_matching_impl.cuh" + +namespace cugraph { + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, float> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +template std::tuple, double> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); + +} // namespace cugraph diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index c13259f0da7..00f7bc6cbe7 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -21,6 +21,7 @@ #include "prims/update_edge_src_dst_property.cuh" #include "utilities/error_check_utils.cuh" +#include #include #include @@ -365,6 +366,24 @@ all_pairs_similarity(raft::handle_t const& handle, v1.resize(new_size, handle.get_stream()); v2.resize(new_size, handle.get_stream()); + if constexpr (multi_gpu) { + // shuffle vertex pairs + auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + + std::tie(v1, v2, std::ignore, std::ignore, std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(v1), + std::move(v2), + std::nullopt, + std::nullopt, + std::nullopt, + vertex_partition_range_lasts); + } + auto score = similarity(handle, graph_view, @@ -537,6 +556,24 @@ all_pairs_similarity(raft::handle_t const& handle, v1.resize(new_size, handle.get_stream()); v2.resize(new_size, handle.get_stream()); + if constexpr (multi_gpu) { + // shuffle vertex pairs + auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + + std::tie(v1, v2, std::ignore, std::ignore, std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(v1), + std::move(v2), + std::nullopt, + std::nullopt, + std::nullopt, + vertex_partition_range_lasts); + } + auto score = similarity(handle, graph_view, diff --git a/cpp/src/mtmg/vertex_pairs_result.cu b/cpp/src/mtmg/vertex_pairs_result.cu new file mode 100644 index 00000000000..55b855d676d --- /dev/null +++ b/cpp/src/mtmg/vertex_pairs_result.cu @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/graph_partition_utils.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace mtmg { + +template +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view) +{ + // FIXME: Should this handle the case of multiple local host threads? + // It currently does not. If vertices were a raft::host_span + // We could have the host threads copy the data to a device_uvector + // and then have rank 0 execute this logic, and we could copy to + // host at the end. + auto stream = handle.raft_handle().get_stream(); + + rmm::device_uvector local_vertices(vertices.size(), stream); + rmm::device_uvector vertex_gpu_ids(vertices.size(), stream); + + raft::copy(local_vertices.data(), vertices.data(), vertices.size(), stream); + cugraph::detail::scalar_fill( + stream, vertex_gpu_ids.data(), vertex_gpu_ids.size(), handle.get_rank()); + + rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), + stream); + raft::update_device(d_vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), + stream); + + if (renumber_map_view) { + cugraph::renumber_ext_vertices( + handle.raft_handle(), + local_vertices.data(), + local_vertices.size(), + renumber_map_view->get(handle).data(), + vertex_partition_view.local_vertex_partition_range_first(), + vertex_partition_view.local_vertex_partition_range_last()); + } + + auto const major_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(); + auto const minor_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size(); + + std::tie(local_vertices, vertex_gpu_ids, std::ignore) = groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + local_vertices.begin(), + local_vertices.end(), + vertex_gpu_ids.begin(), + cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}, + stream); + + // + // LOOK AT THIS... + // I think the above shuffle is correct... + // This will give us vertex/gpu_id tuples on the GPU that vertex is assigned + // to. I need to take this and filter the device vector tuples based on the desired + // vertex (v1). 
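+  // The remaining steps, in brief: copy the locally stored (v1, v2, result)
+  // tuples out of the wrapper, filter them against the shuffled query
+  // vertices, shuffle the surviving tuples back to the GPU that issued each
+  // query (using the gpu id recorded before the first shuffle), and
+  // unrenumber v1 and v2 before returning (when a renumber map is supplied).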
+ // + + // + // Now gather + // + auto& wrapped = this->get(handle); + + rmm::device_uvector v1(std::get<0>(wrapped).size(), stream); + rmm::device_uvector v2(std::get<0>(wrapped).size(), stream); + rmm::device_uvector result(std::get<0>(wrapped).size(), stream); + + thrust::copy( + rmm::exec_policy(stream), + thrust::make_zip_iterator( + std::get<0>(wrapped).begin(), std::get<1>(wrapped).begin(), std::get<2>(wrapped).begin()), + thrust::make_zip_iterator( + std::get<0>(wrapped).end(), std::get<1>(wrapped).end(), std::get<2>(wrapped).end()), + thrust::make_zip_iterator(v1.begin(), v2.begin(), result.begin())); + + thrust::sort_by_key( + rmm::exec_policy(stream), local_vertices.begin(), local_vertices.end(), vertex_gpu_ids.begin()); + + auto new_end = + thrust::remove_if(rmm::exec_policy(stream), + thrust::make_zip_iterator(v1.begin(), v2.begin(), result.begin()), + thrust::make_zip_iterator(v1.end(), v2.end(), result.end()), + [v1_check = raft::device_span{ + local_vertices.data(), local_vertices.size()}] __device__(auto tuple) { + return thrust::binary_search( + thrust::seq, v1_check.begin(), v1_check.end(), thrust::get<0>(tuple)); + }); + + v1.resize( + thrust::distance(thrust::make_zip_iterator(v1.begin(), v2.begin(), result.begin()), new_end), + stream); + v2.resize(v1.size(), stream); + result.resize(v1.size(), stream); + + // + // Shuffle back + // + std::forward_as_tuple(std::ignore, std::tie(v1, v2, result), std::ignore) = + groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + v1.begin(), + v1.end(), + thrust::make_zip_iterator(v1.begin(), v2.begin(), result.begin()), + cuda::proclaim_return_type( + [local_v = raft::device_span{local_vertices.data(), local_vertices.size()}, + gpu = raft::device_span{vertex_gpu_ids.data(), + vertex_gpu_ids.size()}] __device__(auto v1) { + return gpu[thrust::distance( + local_v.begin(), thrust::lower_bound(thrust::seq, local_v.begin(), local_v.end(), v1))]; + }), + stream); + + if (renumber_map_view) { + cugraph::unrenumber_int_vertices(handle.raft_handle(), + v1.data(), + v1.size(), + renumber_map_view->get(handle).data(), + vertex_partition_range_lasts); + + cugraph::unrenumber_int_vertices(handle.raft_handle(), + v2.data(), + v2.size(), + renumber_map_view->get(handle).data(), + vertex_partition_range_lasts); + } + + return std::make_tuple(std::move(v1), std::move(v2), std::move(result)); +} + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t 
vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + vertex_pair_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index 2cc7856d87a..76b64b5692b 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -526,6 +526,7 @@ class kv_cuco_store_t { std::conditional_t, value_t, void>>(0, stream)) { allocate(capacity, invalid_key, invalid_value, stream); + if constexpr (!std::is_arithmetic_v) { invalid_value_ = invalid_value; } capacity_ = capacity; size_ = 0; } diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 1f6f29d8683..fb837484a14 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -31,8 +31,6 @@ #include -#include - #include #include #include @@ -149,11 +147,11 @@ void bfs(raft::handle_t const& handle, auto constexpr invalid_distance = std::numeric_limits::max(); auto constexpr invalid_vertex = invalid_vertex_id::value; - thrust::fill(rmm::exec_policy(handle.get_thrust_policy()), + thrust::fill(handle.get_thrust_policy(), distances, distances + 
push_graph_view.local_vertex_partition_range_size(), invalid_distance); - thrust::fill(rmm::exec_policy(handle.get_thrust_policy()), + thrust::fill(handle.get_thrust_policy(), predecessor_first, predecessor_first + push_graph_view.local_vertex_partition_range_size(), invalid_vertex); @@ -161,7 +159,7 @@ void bfs(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); if (n_sources) { thrust::for_each( - rmm::exec_policy(handle.get_thrust_policy()), + handle.get_thrust_policy(), sources, sources + n_sources, [vertex_partition, distances, predecessor_first] __device__(auto v) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2dcda796f9c..ced3b7bedb1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -309,6 +309,10 @@ ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp) # - LEIDEN tests ---------------------------------------------------------------------------------- ConfigureTest(LEIDEN_TEST community/leiden_test.cpp) +################################################################################################### +# - WEIGHTED MATCHING tests ---------------------------------------------------------------------------------- +ConfigureTest(WEIGHTED_MATCHING_TEST community/weighted_matching_test.cpp) + ################################################################################################### # - Legacy ECG tests ------------------------------------------------------------------------------------- ConfigureTest(LEGACY_ECG_TEST community/legacy_ecg_test.cpp) @@ -570,6 +574,10 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LEIDEN tests -------------------------------------------------------------------------- ConfigureTestMG(MG_LEIDEN_TEST community/mg_leiden_test.cpp) + ############################################################################################### + # - MG WEIGHTED MATCHING tests -------------------------------------------------------------------------- + ConfigureTestMG(MG_WEIGHTED_MATCHING_TEST community/mg_weighted_matching_test.cpp) + ############################################################################################### # - MG ECG tests -------------------------------------------------------------------------- ConfigureTestMG(MG_ECG_TEST community/mg_ecg_test.cpp) @@ -733,6 +741,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_K_CORE_TEST c_api/mg_k_core_test.c) ConfigureCTestMG(MG_CAPI_INDUCED_SUBGRAPH_TEST c_api/mg_induced_subgraph_test.c) ConfigureCTestMG(MG_CAPI_DEGREES c_api/mg_degrees_test.c) + ConfigureCTestMG(MG_CAPI_COUNT_MULTI_EDGES c_api/mg_count_multi_edges_test.c) ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c) ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST c_api/mg_two_hop_neighbors_test.c) @@ -777,6 +786,7 @@ ConfigureCTest(CAPI_SIMILARITY_TEST c_api/similarity_test.c) ConfigureCTest(CAPI_K_CORE_TEST c_api/k_core_test.c) ConfigureCTest(CAPI_INDUCED_SUBGRAPH_TEST c_api/induced_subgraph_test.c) ConfigureCTest(CAPI_DEGREES c_api/degrees_test.c) +ConfigureCTest(CAPI_COUNT_MULTI_EDGES c_api/count_multi_edges_test.c) ConfigureCTest(CAPI_EGONET_TEST c_api/egonet_test.c) ConfigureCTest(CAPI_TWO_HOP_NEIGHBORS_TEST c_api/two_hop_neighbors_test.c) ConfigureCTest(CAPI_K_TRUSS_TEST c_api/k_truss_test.c) @@ -787,14 +797,31 @@ if (BUILD_CUGRAPH_MTMG_TESTS) ConfigureTest(MTMG_TEST mtmg/threaded_test.cu) target_link_libraries(MTMG_TEST PRIVATE - UCP::UCP + cugraphmgtestutil + ${COMPILED_RAFT_LIB} + ucx::ucp + ucx::ucs + ucxx::ucxx ) ConfigureTest(MTMG_LOUVAIN_TEST 
mtmg/threaded_test_louvain.cu) target_link_libraries(MTMG_LOUVAIN_TEST PRIVATE cugraphmgtestutil - UCP::UCP + ${COMPILED_RAFT_LIB} + ucx::ucp + ucx::ucs + ucxx::ucxx + ) + + ConfigureTest(MTMG_JACCARD_TEST mtmg/threaded_test_jaccard.cu) + target_link_libraries(MTMG_JACCARD_TEST + PRIVATE + cugraphmgtestutil + ${COMPILED_RAFT_LIB} + ucx::ucp + ucx::ucs + ucxx::ucxx ) if(BUILD_CUGRAPH_MG_TESTS) @@ -804,7 +831,10 @@ if (BUILD_CUGRAPH_MTMG_TESTS) target_link_libraries(MTMG_MULTINODE_TEST PRIVATE cugraphmgtestutil - UCP::UCP + ${COMPILED_RAFT_LIB} + ucx::ucp + ucx::ucs + ucxx::ucxx ) endif(BUILD_CUGRAPH_MG_TESTS) endif(BUILD_CUGRAPH_MTMG_TESTS) diff --git a/cpp/tests/c_api/count_multi_edges_test.c b/cpp/tests/c_api/count_multi_edges_test.c new file mode 100644 index 00000000000..222cf12ea36 --- /dev/null +++ b/cpp/tests/c_api/count_multi_edges_test.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +/* + * Create graph and count multi-edges + */ +int generic_count_multi_edges_test(vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_vertices, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + bool_t is_multigraph, + size_t multi_edges_count) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_resource_handle_t* handle = NULL; + cugraph_graph_t* graph = NULL; + size_t result = 0; + + handle = cugraph_create_resource_handle(NULL); + TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); + + ret_code = create_sg_test_graph(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + h_wgt, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + store_transposed, + FALSE, + is_symmetric, + is_multigraph, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_count_multi_edges(handle, graph, FALSE, &result, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_count_multi_edges failed."); + + TEST_ASSERT(test_ret_value, result == multi_edges_count, "multi-edge count did not match"); + + cugraph_graph_free(graph); + cugraph_error_free(ret_error); + + return test_ret_value; +} + +int test_multi_edges_count() +{ + size_t num_edges = 14; + size_t num_vertices = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 0, 1, 1, 3, 0, 1}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 1, 3, 0, 1, 1, 0}; + weight_t h_wgt[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + size_t multi_edge_count = 4; + + return generic_count_multi_edges_test( + h_src, h_dst, h_wgt, num_vertices, num_edges, TRUE, TRUE, TRUE, multi_edge_count); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + int result = 0; + result |= RUN_TEST(test_multi_edges_count); + return result; +} diff --git a/cpp/tests/c_api/mg_count_multi_edges_test.c b/cpp/tests/c_api/mg_count_multi_edges_test.c new file mode 100644 index 00000000000..69eaaff40dc --- /dev/null +++ b/cpp/tests/c_api/mg_count_multi_edges_test.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +/* + * Create graph and count multi-edges + */ +int generic_count_multi_edges_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_vertices, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + bool_t is_multigraph, + size_t multi_edges_count) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_graph_t* graph = NULL; + size_t result = 0; + + ret_code = create_mg_test_graph_new(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + h_wgt, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + store_transposed, + FALSE, + is_symmetric, + is_multigraph, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_count_multi_edges(handle, graph, FALSE, &result, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_count_multi_edges failed."); + + TEST_ASSERT(test_ret_value, result == multi_edges_count, "multi-edge count did not match"); + + cugraph_graph_free(graph); + cugraph_error_free(ret_error); + + return test_ret_value; +} + +int test_multi_edges_count(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 14; + size_t num_vertices = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 0, 1, 1, 3, 0, 1}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 1, 3, 0, 1, 1, 0}; + weight_t h_wgt[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + size_t multi_edge_count = 4; + + return generic_count_multi_edges_test( + handle, h_src, h_dst, h_wgt, num_vertices, num_edges, TRUE, TRUE, TRUE, multi_edge_count); +} + +/******************************************************************************/ + +int main(int argc, char** argv) 
+{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_multi_edges_count, handle); + + cugraph_free_resource_handle(handle); + free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/community/mg_weighted_matching_test.cpp b/cpp/tests/community/mg_weighted_matching_test.cpp new file mode 100644 index 00000000000..21963922ab1 --- /dev/null +++ b/cpp/tests/community/mg_weighted_matching_test.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governin_from_mtxg permissions and + * limitations under the License. + */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +struct WeightedMatching_UseCase { + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_MGWeightedMatching + : public ::testing::TestWithParam> { + public: + Tests_MGWeightedMatching() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(std::tuple const& param) + { + auto [weighted_matching_usecase, input_usecase] = param; + + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + constexpr bool multi_gpu = true; + + bool test_weighted = true; + bool renumber = true; + bool drop_self_loops = false; + bool drop_multi_edges = false; + + auto [mg_graph, mg_edge_weights, mg_renumber_map] = + cugraph::test::construct_graph( + *handle_, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges); + + std::tie(mg_graph, mg_edge_weights, mg_renumber_map) = cugraph::symmetrize_graph( + *handle_, + std::move(mg_graph), + std::move(mg_edge_weights), + mg_renumber_map ? std::optional>(std::move(*mg_renumber_map)) + : std::nullopt, + false); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + auto mg_edge_weight_view = + mg_edge_weights ? 
std::make_optional((*mg_edge_weights).view()) : std::nullopt; + + std::optional> edge_mask{std::nullopt}; + if (weighted_matching_usecase.edge_masking) { + edge_mask = cugraph::test::generate::edge_property( + *handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + + rmm::device_uvector mg_partners(0, handle_->get_stream()); + weight_t mg_matching_weights; + + std::forward_as_tuple(mg_partners, mg_matching_weights) = + cugraph::approximate_weighted_matching( + *handle_, mg_graph_view, (*mg_edge_weights).view()); + + if (weighted_matching_usecase.check_correctness) { + auto h_mg_partners = cugraph::test::to_host(*handle_, mg_partners); + + auto constexpr invalid_partner = cugraph::invalid_vertex_id::value; + + rmm::device_uvector mg_aggregate_partners(0, handle_->get_stream()); + std::tie(std::ignore, mg_aggregate_partners) = + cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values( + *handle_, + std::optional>{std::nullopt}, + mg_graph_view.local_vertex_partition_range(), + std::optional>{std::nullopt}, + std::optional>{std::nullopt}, + raft::device_span(mg_partners.data(), mg_partners.size())); + + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + mg_edge_weight_view, + std::optional>(std::nullopt), + false); + + if (handle_->get_comms().get_rank() == 0) { + auto sg_graph_view = sg_graph.view(); + + rmm::device_uvector sg_partners(0, handle_->get_stream()); + weight_t sg_matching_weights; + + std::forward_as_tuple(sg_partners, sg_matching_weights) = + cugraph::approximate_weighted_matching( + *handle_, sg_graph_view, (*sg_edge_weights).view()); + auto h_sg_partners = cugraph::test::to_host(*handle_, sg_partners); + auto h_mg_aggregate_partners = cugraph::test::to_host(*handle_, mg_aggregate_partners); + + ASSERT_FLOAT_EQ(mg_matching_weights, sg_matching_weights) + << "SG and MG matching weights are different"; + ASSERT_TRUE( + std::equal(h_sg_partners.begin(), h_sg_partners.end(), h_mg_aggregate_partners.begin())); + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGWeightedMatching::handle_ = nullptr; + +using Tests_MGWeightedMatching_File = Tests_MGWeightedMatching; +using Tests_MGWeightedMatching_Rmat = Tests_MGWeightedMatching; + +TEST_P(Tests_MGWeightedMatching_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_MGWeightedMatching_File, CheckInt32Int64FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_MGWeightedMatching_File, CheckInt64Int64FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt32Int64FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_MGWeightedMatching_Rmat, CheckInt64Int64FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGWeightedMatching_File, + ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false}, + 
WeightedMatching_UseCase{true}),
+                                            ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGWeightedMatching_Rmat,
+                         ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false},
+                                                              WeightedMatching_UseCase{true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              3, 2, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGWeightedMatching_Rmat,
+  ::testing::Combine(
+    ::testing::Values(WeightedMatching_UseCase{false, false},
+                      WeightedMatching_UseCase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/weighted_matching_test.cpp b/cpp/tests/community/weighted_matching_test.cpp
new file mode 100644
index 00000000000..436273c3be3
--- /dev/null
+++ b/cpp/tests/community/weighted_matching_test.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +struct WeightedMatching_UseCase { + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_SGWeightedMatching + : public ::testing::TestWithParam> { + public: + Tests_SGWeightedMatching() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(std::tuple const& param) + { + auto [weighted_matching_usecase, input_usecase] = param; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + hr_timer.start("Construct graph"); + } + + constexpr bool multi_gpu = false; + + bool test_weighted = true; + bool renumber = true; + bool drop_self_loops = false; + bool drop_multi_edges = false; + + auto [sg_graph, sg_edge_weights, sg_renumber_map] = + cugraph::test::construct_graph( + handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges); + + std::tie(sg_graph, sg_edge_weights, sg_renumber_map) = cugraph::symmetrize_graph( + handle, std::move(sg_graph), std::move(sg_edge_weights), std::move(sg_renumber_map), false); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto sg_graph_view = sg_graph.view(); + auto sg_edge_weight_view = + sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt; + + std::optional> edge_mask{std::nullopt}; + if (weighted_matching_usecase.edge_masking) { + edge_mask = cugraph::test::generate::edge_property( + handle, sg_graph_view, 2); + sg_graph_view.attach_edge_mask((*edge_mask).view()); + } + + rmm::device_uvector d_partners(0, handle.get_stream()); + weight_t total_matching_weights; + + std::forward_as_tuple(d_partners, total_matching_weights) = + cugraph::approximate_weighted_matching( + handle, sg_graph_view, (*sg_edge_weights).view()); + + if (weighted_matching_usecase.check_correctness) { + auto h_partners = cugraph::test::to_host(handle, d_partners); + auto constexpr invalid_partner = cugraph::invalid_vertex_id::value; + + std::for_each(h_partners.begin(), h_partners.end(), [&invalid_partner, h_partners](auto& v) { + if (v != invalid_partner) ASSERT_TRUE(h_partners[h_partners[v]] == v); + }); + } + } +}; + +using Tests_SGWeightedMatching_File = Tests_SGWeightedMatching; +using Tests_SGWeightedMatching_Rmat = Tests_SGWeightedMatching; + +TEST_P(Tests_SGWeightedMatching_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SGWeightedMatching_File, CheckInt32Int64FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SGWeightedMatching_File, CheckInt64Int64FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt32Int64FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + 
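
Both the SG and MG weighted-matching tests above validate the partner array returned by cugraph::approximate_weighted_matching with the same symmetry invariant: every matched vertex must be its partner's partner. A host-only sketch of that check, with std::numeric_limits<vertex_t>::max() standing in for cugraph::invalid_vertex_id<vertex_t>::value (orientation only, not cugraph code):

// Minimal host-side sketch of the invariant the matching tests check.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  using vertex_t          = int32_t;
  auto constexpr invalid  = std::numeric_limits<vertex_t>::max();  // stand-in for invalid_vertex_id

  // partner[v] == u means edge (v, u) is in the matching; unmatched vertices hold `invalid`.
  std::vector<vertex_t> partner = {1, 0, 3, 2, invalid};

  for (vertex_t v = 0; v < static_cast<vertex_t>(partner.size()); ++v) {
    if (partner[v] != invalid) { assert(partner[partner[v]] == v); }
  }
  std::printf("matching is symmetric\n");
  return 0;
}
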
+TEST_P(Tests_SGWeightedMatching_Rmat, CheckInt64Int64FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SGWeightedMatching_File, + ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false}, + WeightedMatching_UseCase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P(rmat_small_test, + Tests_SGWeightedMatching_Rmat, + ::testing::Combine(::testing::Values(WeightedMatching_UseCase{false}, + WeightedMatching_UseCase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase( + 3, 3, 0.57, 0.19, 0.19, 0, true, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_SGWeightedMatching_Rmat, + ::testing::Combine( + ::testing::Values(WeightedMatching_UseCase{false, false}, + WeightedMatching_UseCase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu index 0ac8c56bc46..1ad83761d51 100644 --- a/cpp/tests/mtmg/multi_node_threaded_test.cu +++ b/cpp/tests/mtmg/multi_node_threaded_test.cu @@ -14,8 +14,9 @@ * limitations under the License. */ #include "utilities/base_fixture.hpp" +#include "utilities/check_utilities.hpp" +#include "utilities/conversion_utilities.hpp" #include "utilities/test_graphs.hpp" -#include "utilities/test_utilities.hpp" #include "utilities/thrust_wrapper.hpp" #include diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu index 5d902e35dfa..f55a102ea67 100644 --- a/cpp/tests/mtmg/threaded_test.cu +++ b/cpp/tests/mtmg/threaded_test.cu @@ -14,8 +14,9 @@ * limitations under the License. */ #include "utilities/base_fixture.hpp" +#include "utilities/check_utilities.hpp" +#include "utilities/conversion_utilities.hpp" #include "utilities/test_graphs.hpp" -#include "utilities/test_utilities.hpp" #include "utilities/thrust_wrapper.hpp" #include @@ -388,11 +389,11 @@ class Tests_Multithreaded std::for_each( computed_pageranks_v.begin(), computed_pageranks_v.end(), - [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t1) { + [&h_sg_pageranks, compare_functor, &h_sg_renumber_map](auto t1) { std::for_each( thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), - [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t2) { + [&h_sg_pageranks, compare_functor, &h_sg_renumber_map](auto t2) { vertex_t v = thrust::get<0>(t2); weight_t pr = thrust::get<1>(t2); diff --git a/cpp/tests/mtmg/threaded_test_jaccard.cu b/cpp/tests/mtmg/threaded_test_jaccard.cu new file mode 100644 index 00000000000..a64cc8ee1fa --- /dev/null +++ b/cpp/tests/mtmg/threaded_test_jaccard.cu @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
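
The threaded_test.cu hunk above switches the validation lambdas from capturing h_sg_pageranks and h_sg_renumber_map by value to capturing them by reference. A by-value capture copies the whole host vector into each closure object that is constructed, which in the nested std::for_each meant one copy of the SG reference data per compared chunk; a by-reference capture stores only a reference. A plain-C++ illustration with hypothetical names:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> h_reference(1'000'000, 0.5f);  // large host-side reference results
  std::vector<int> work_items(100);

  // Capturing [h_reference] by value would copy the 1M-element vector into the closure
  // (and again whenever the closure is copied); [&h_reference] stores only a reference.
  std::for_each(work_items.begin(), work_items.end(), [&h_reference](int) {
    // ... compare a computed value against h_reference ...
  });

  std::printf("checked %zu items against %zu reference values\n",
              work_items.size(), h_reference.size());
  return 0;
}
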
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utilities/base_fixture.hpp" +#include "utilities/check_utilities.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/test_graphs.hpp" +#include "utilities/thrust_wrapper.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#include +#include + +#include + +struct Multithreaded_Usecase { + bool check_correctness{true}; +}; + +template +class Tests_Multithreaded + : public ::testing::TestWithParam> { + public: + Tests_Multithreaded() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + std::vector get_gpu_list() + { + int num_gpus_per_node{1}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + + std::vector gpu_list(num_gpus_per_node); + std::iota(gpu_list.begin(), gpu_list.end(), 0); + + return gpu_list; + } + + template + void run_current_test( + std::tuple const& param, + std::vector gpu_list) + { + using edge_type_t = int32_t; + + constexpr bool renumber = true; + constexpr bool do_expensive_check = false; + constexpr bool store_transposed = false; + constexpr bool test_weighted = false; + + auto [multithreaded_usecase, input_usecase] = param; + + raft::handle_t handle{}; + + size_t device_buffer_size{64 * 1024 * 1024}; + size_t thread_buffer_size{4 * 1024 * 1024}; + + const int num_threads_per_gpu{4}; + int num_gpus = gpu_list.size(); + int num_threads = num_gpus * num_threads_per_gpu; + + cugraph::mtmg::resource_manager_t resource_manager; + + std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) { + resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id}); + }); + + ncclUniqueId instance_manager_id; + ncclGetUniqueId(&instance_manager_id); + + // Currently the only uses for multiple streams for each CPU threads + // associated with a particular GPU, which is a constant set above + auto instance_manager = resource_manager.create_instance_manager( + resource_manager.registered_ranks(), instance_manager_id, num_threads_per_gpu); + + cugraph::mtmg::edgelist_t edgelist; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::vertex_pair_result_t jaccard_results; + std::optional> renumber_map = + std::make_optional>(); + + std::optional, + weight_t>> + edge_weights{std::nullopt}; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + 
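
Every phase of this test (initializing the per-GPU edgelists, loading edges, building the graph, running Jaccard, gathering results) repeats the same cycle: spawn one std::thread per GPU or per worker, have each thread pull a handle from the instance manager, do its slice of the work, then join and reset_threads before the next phase begins. A stripped-down, cugraph-free sketch of that cycle (the instance-manager calls appear only as comments):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
  int const num_gpus{2};
  int const num_threads_per_gpu{4};
  std::atomic<int> work_done{0};

  std::vector<std::thread> running_threads;

  // One worker per (gpu, thread) pair; in the real test each worker starts with
  // auto thread_handle = instance_manager->get_handle();
  for (int i = 0; i < num_gpus * num_threads_per_gpu; ++i) {
    running_threads.emplace_back([i, &work_done]() {
      // ... append this thread's share of edges / run the algorithm ...
      work_done.fetch_add(1);
    });
  }

  // Wait for CPU threads to complete; the real test then calls
  // instance_manager->reset_threads() before starting the next phase.
  for (auto& t : running_threads) { t.join(); }
  running_threads.clear();

  std::printf("phase finished, %d workers ran\n", work_done.load());
  return 0;
}
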
running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, test_weighted, store_transposed, false); + + rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin()); + thrust::copy(handle.get_thrust_policy(), + d_dst_v.begin(), + d_dst_v.end(), + d_unique_vertices.begin() + d_src_v.size()); + thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end()); + + d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + d_unique_vertices.begin(), + d_unique_vertices.end())), + handle.get_stream()); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices); + + // Load edgelist from different threads. We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + &jaccard_results, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, store_transposed); + + cugraph::mtmg::create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &jaccard_results]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + auto local_results = + cugraph::jaccard_all_pairs_coefficients( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? 
std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::nullopt, + std::nullopt, + true); + + jaccard_results.set(thread_handle, std::move(local_results)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector, std::vector>> + computed_similarities_v; + std::mutex computed_similarities_lock{}; + + auto jaccard_results_view = jaccard_results.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + // Load computed_similarities from different threads. + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &jaccard_results_view, + &computed_similarities_lock, + &computed_similarities_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices.size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back(unique_vertices[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto xxx = graph_view.get_vertex_partition_range_lasts(thread_handle); + + // TODO: What's this going to look like? + auto [d_my_jaccard_v1, d_my_jaccard_v2, d_my_jaccard_similarity] = + jaccard_results_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + // graph_view.get_vertex_partition_range_lasts(thread_handle), + xxx, + graph_view.get_vertex_partition_view(thread_handle), + renumber_map_view); + + std::vector my_jaccard_v1(d_my_jaccard_v1.size()); + std::vector my_jaccard_v2(d_my_jaccard_v2.size()); + std::vector my_jaccard_similarity(d_my_jaccard_similarity.size()); + raft::update_host(my_jaccard_v1.data(), + d_my_jaccard_v1.data(), + d_my_jaccard_v1.size(), + thread_handle.raft_handle().get_stream()); + raft::update_host(my_jaccard_v2.data(), + d_my_jaccard_v2.data(), + d_my_jaccard_v2.size(), + thread_handle.raft_handle().get_stream()); + raft::update_host(my_jaccard_similarity.data(), + d_my_jaccard_similarity.data(), + d_my_jaccard_similarity.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_similarities_lock); + computed_similarities_v.push_back(std::make_tuple( + std::move(my_jaccard_v1), std::move(my_jaccard_v2), std::move(my_jaccard_similarity))); + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_similarities_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, + weight_t>> + sg_edge_weights{std::nullopt}; + + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = + cugraph::create_graph_from_edgelist(handle, + std::nullopt, + std::move(d_src_v), + std::move(d_dst_v), + 
std::move(d_weights_v), + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, true}, + false); + + auto [sg_v1, sg_v2, sg_similarities] = + cugraph::jaccard_all_pairs_coefficients( + handle, + sg_graph.view(), + sg_edge_weights ? std::make_optional(sg_edge_weights->view()) : std::nullopt, + std::nullopt, + std::nullopt); + + auto h_sg_v1 = cugraph::test::to_host(handle, sg_v1); + auto h_sg_v2 = cugraph::test::to_host(handle, sg_v2); + auto h_sg_similarities = cugraph::test::to_host(handle, sg_similarities); + auto compare_functor = cugraph::test::nearly_equal{ + weight_t{1e-3}, + weight_t{(weight_t{1} / static_cast(h_sg_v1.size())) * weight_t{1e-3}}}; + + std::map, weight_t> sg_results; + + std::for_each( + thrust::make_zip_iterator(h_sg_v1.begin(), h_sg_v2.begin(), h_sg_similarities.begin()), + thrust::make_zip_iterator(h_sg_v1.end(), h_sg_v2.end(), h_sg_similarities.end()), + [&sg_results](auto tuple) { + sg_results.insert(std::make_pair( + std::make_tuple(thrust::get<0>(tuple), thrust::get<1>(tuple)), thrust::get<2>(tuple))); + }); + + std::for_each( + computed_similarities_v.begin(), + computed_similarities_v.end(), + [&sg_results, compare_functor](auto t1) { + std::for_each( + thrust::make_zip_iterator( + std::get<0>(t1).begin(), std::get<1>(t1).begin(), std::get<2>(t1).begin()), + thrust::make_zip_iterator( + std::get<0>(t1).end(), std::get<1>(t1).end(), std::get<2>(t1).end()), + [&sg_results, compare_functor](auto t2) { + vertex_t v1 = thrust::get<0>(t2); + vertex_t v2 = thrust::get<1>(t2); + weight_t jaccard = thrust::get<2>(t2); + + auto pos = sg_results.find(std::make_tuple(v1, v2)); + + ASSERT_NE(pos, sg_results.end()) + << "vertex pair (" << v1 << "," << v2 << ") from mtmg result not found in SG result" + << std::endl; + + ASSERT_TRUE(compare_functor(jaccard, pos->second)) + << "vertex pair (" << v1 << "," << v2 << ") SG result = " << pos->second + << ", mtmg result = " << jaccard; + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{false}), + 
::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(Multithreaded_Usecase{false}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu index ff9641d59f8..c8faf33dae2 100644 --- a/cpp/tests/mtmg/threaded_test_louvain.cu +++ b/cpp/tests/mtmg/threaded_test_louvain.cu @@ -14,9 +14,9 @@ * limitations under the License. */ #include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" #include "utilities/device_comm_wrapper.hpp" #include "utilities/test_graphs.hpp" -#include "utilities/test_utilities.hpp" #include "utilities/thrust_wrapper.hpp" #include @@ -83,6 +83,7 @@ class Tests_Multithreaded constexpr bool renumber = true; constexpr bool do_expensive_check = false; + constexpr bool store_transposed = false; auto [multithreaded_usecase, input_usecase] = param; @@ -111,17 +112,18 @@ class Tests_Multithreaded resource_manager.registered_ranks(), instance_manager_id); cugraph::mtmg::edgelist_t edgelist; - cugraph::mtmg::graph_t graph; - cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; cugraph::mtmg::vertex_result_t louvain_clusters; std::optional> renumber_map = std::make_optional>(); - auto edge_weights = multithreaded_usecase.test_weighted - ? std::make_optional, - weight_t>>() - : std::nullopt; + auto edge_weights = + multithreaded_usecase.test_weighted + ? 
std::make_optional, + weight_t>>() + : std::nullopt; // // Simulate graph creation by spawning threads to walk through the @@ -220,29 +222,34 @@ class Tests_Multithreaded if (thread_handle.get_thread_rank() > 0) return; std::optional, + cugraph::mtmg::graph_view_t, edge_t>> edge_ids{std::nullopt}; std::optional, + cugraph::mtmg::graph_view_t, int32_t>> edge_types{std::nullopt}; edgelist.finalize_buffer(thread_handle); - edgelist.consolidate_and_shuffle(thread_handle, false); - - cugraph::mtmg:: - create_graph_from_edgelist( - thread_handle, - edgelist, - cugraph::graph_properties_t{is_symmetric, true}, - renumber, - graph, - edge_weights, - edge_ids, - edge_types, - renumber_map, - do_expensive_check); + edgelist.consolidate_and_shuffle(thread_handle, store_transposed); + + cugraph::mtmg::create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); }); } @@ -365,9 +372,10 @@ class Tests_Multithreaded if (multithreaded_usecase.check_correctness) { // Want to compare the results in computed_clusters_v with SG results - cugraph::graph_t sg_graph(handle); + cugraph::graph_t sg_graph(handle); std::optional< - cugraph::edge_property_t, weight_t>> + cugraph::edge_property_t, + weight_t>> sg_edge_weights{std::nullopt}; for (int i = 0; i < num_gpus; ++i) { diff --git a/dependencies.yaml b/dependencies.yaml index 9dca069ea33..c0699fdb1c5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -565,7 +565,7 @@ dependencies: - cugraph==24.6.* - pytorch>=2.0 - pytorch-cuda==11.8 - - pyg>=2.4.0 + - pyg>=2.5,<2.6 depends_on_rmm: common: diff --git a/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst index 3e6664b2db5..8b5efd7aa36 100644 --- a/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst +++ b/docs/cugraph/source/api_docs/cugraph-ops/python/operators.rst @@ -47,10 +47,26 @@ Graph Attention (GATConv/GATv2Conv) .. autosummary:: :toctree: ../../api/ops - operators.mha_gat_n2n_fwd - operators.mha_gat_n2n_bwd - operators.mha_gat_n2n_efeat_fwd - operators.mha_gat_n2n_efeat_bwd + operators.mha_gat_n2n_fwd_bf16_fp32 + operators.mha_gat_n2n_fwd_fp16_fp32 + operators.mha_gat_n2n_fwd_fp32_fp32 + operators.mha_gat_n2n_bwd_bf16_bf16_bf16_fp32 + operators.mha_gat_n2n_bwd_bf16_bf16_fp32_fp32 + operators.mha_gat_n2n_bwd_bf16_fp32_fp32_fp32 + operators.mha_gat_n2n_bwd_fp16_fp16_fp16_fp32 + operators.mha_gat_n2n_bwd_fp16_fp16_fp32_fp32 + operators.mha_gat_n2n_bwd_fp16_fp32_fp32_fp32 + operators.mha_gat_n2n_bwd_fp32_fp32_fp32_fp32 + operators.mha_gat_n2n_efeat_fwd_bf16_fp32 + operators.mha_gat_n2n_efeat_fwd_fp16_fp32 + operators.mha_gat_n2n_efeat_fwd_fp32_fp32 + operators.mha_gat_n2n_efeat_bwd_bf16_bf16_bf16_fp32 + operators.mha_gat_n2n_efeat_bwd_bf16_bf16_fp32_fp32 + operators.mha_gat_n2n_efeat_bwd_bf16_fp32_fp32_fp32 + operators.mha_gat_n2n_efeat_bwd_fp16_fp16_fp16_fp32 + operators.mha_gat_n2n_efeat_bwd_fp16_fp16_fp32_fp32 + operators.mha_gat_n2n_efeat_bwd_fp16_fp32_fp32_fp32 + operators.mha_gat_n2n_efeat_bwd_fp32_fp32_fp32_fp32 operators.mha_gat_v2_n2n_fwd operators.mha_gat_v2_n2n_bwd diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py index 952b962aca2..66bc3137fba 100644 --- a/docs/cugraph/source/conf.py +++ b/docs/cugraph/source/conf.py @@ -190,9 +190,17 @@ 'Miscellaneous'), ] -# Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'https://docs.python.org/': None} - +# Connect docs in other projects +intersphinx_mapping = { + "networkx": ( + "https://networkx.org/documentation/stable/", + "https://networkx.org/documentation/stable/objects.inv", + ), + "python": ( + "https://docs.python.org/3", + "https://docs.python.org/3/objects.inv", + ), +} # Config numpydoc numpydoc_show_inherited_class_members = False diff --git a/docs/cugraph/source/graph_support/DGL_support.md b/docs/cugraph/source/graph_support/DGL_support.md index dc4f66180ac..9df462155fd 100644 --- a/docs/cugraph/source/graph_support/DGL_support.md +++ b/docs/cugraph/source/graph_support/DGL_support.md @@ -17,7 +17,7 @@ mamba install cugraph-dgl -c rapidsai-nightly -c rapidsai -c pytorch -c conda-fo ### Create the conda development environment ``` -mamba env create -n cugraph_dgl_dev --file conda/cugraph_dgl_dev_11.6.yml +conda env create -n cugraph_dgl_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml ``` ### Install in editable mode diff --git a/docs/cugraph/source/graph_support/cugraphops_support.rst b/docs/cugraph/source/graph_support/cugraphops_support.rst index fd79564f849..96b13f62a9c 100644 --- a/docs/cugraph/source/graph_support/cugraphops_support.rst +++ b/docs/cugraph/source/graph_support/cugraphops_support.rst @@ -7,4 +7,4 @@ cugraph-ops aims to be a low-level, framework agnostic library providing commonl .. toctree:: :maxdepth: 3 - https://github.com/rapidsai/cugraph-ops/blob/branch-23.04/README.md + https://github.com/rapidsai/cugraph/blob/branch-24.06/readme_pages/cugraph_ops.md diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 94e9f1decbd..ebef0094cfa 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -12,7 +12,7 @@ dependencies: - cugraph==24.6.* - pandas - pre-commit -- pyg>=2.4.0 +- pyg>=2.5,<2.6 - pylibcugraphops==24.6.* - pytest - pytest-benchmark diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index 10431a0398d..713448a8203 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,11 +15,15 @@ from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch import CSC, HeteroCSC +import pylibcugraphops.pytorch + torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") +# A tuple of (row, colptr, num_src_nodes) +CSC = Tuple[torch.Tensor, torch.Tensor, int] + class BaseConv(torch.nn.Module): # pragma: no cover r"""An abstract base class for implementing cugraph-ops message passing layers.""" @@ -33,10 +37,7 @@ def to_csc( edge_index: torch.Tensor, size: Optional[Tuple[int, int]] = None, edge_attr: Optional[torch.Tensor] = None, - ) -> Union[ - Tuple[torch.Tensor, torch.Tensor, int], - Tuple[Tuple[torch.Tensor, torch.Tensor, int], torch.Tensor], - ]: + ) -> Union[CSC, Tuple[CSC, torch.Tensor],]: r"""Returns a CSC representation of an :obj:`edge_index` tensor to be used as input to cugraph-ops conv layers. 
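
The new CSC alias above names the (row, colptr, num_src_nodes) tuple that to_csc produces. For readers who want the layout spelled out, a plain-C++ illustration with toy data (not cugraph-ops or PyG API): colptr has one entry per destination node plus one, and the sources of the edges entering destination d occupy row[colptr[d] .. colptr[d+1]).

#include <cstdio>
#include <vector>

int main()
{
  // Edges (src -> dst): 0->1, 2->1, 1->2 on a 3-node graph.
  int num_src_nodes = 3;
  std::vector<int> colptr = {0, 0, 2, 3};  // dst 0 has no in-edges, dst 1 has two, dst 2 has one
  std::vector<int> row    = {0, 2, 1};     // source vertices, grouped by destination

  for (int d = 0; d + 1 < static_cast<int>(colptr.size()); ++d) {
    for (int e = colptr[d]; e < colptr[d + 1]; ++e) {
      std::printf("edge %d -> %d\n", row[e], d);
    }
  }
  std::printf("num_src_nodes = %d\n", num_src_nodes);
  return 0;
}
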
@@ -71,19 +72,17 @@ def to_csc( def get_cugraph( self, - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> CSC: + ) -> Tuple[pylibcugraphops.pytorch.CSC, Optional[torch.Tensor]]: r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. Supports both bipartite and non-bipartite graphs. Args: - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. bipartite (bool): If set to :obj:`True`, will create the bipartite structure in cugraph-ops. (default: :obj:`False`) max_num_neighbors (int, optional): The maximum number of neighbors @@ -91,7 +90,13 @@ def get_cugraph( the message-flow-graph primitives in cugraph-ops. (default: :obj:`None`) """ - row, colptr, num_src_nodes = csc + perm = None + if isinstance(edge_index, torch_geometric.EdgeIndex): + edge_index, perm = edge_index.sort_by("col") + num_src_nodes = edge_index.get_sparse_size(0) + (colptr, row), _ = edge_index.get_csc() + else: + row, colptr, num_src_nodes = edge_index if not row.is_cuda: raise RuntimeError( @@ -102,32 +107,33 @@ def get_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 - return CSC( - offsets=colptr, - indices=row, - num_src_nodes=num_src_nodes, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, + return ( + pylibcugraphops.pytorch.CSC( + offsets=colptr, + indices=row, + num_src_nodes=num_src_nodes, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ), + perm, ) def get_typed_cugraph( self, - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_type: torch.Tensor, num_edge_types: Optional[int] = None, bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> HeteroCSC: + ) -> Tuple[pylibcugraphops.pytorch.HeteroCSC, Optional[torch.Tensor]]: r"""Constructs a typed :obj:`cugraph` graph object from a CSC representation where each edge corresponds to a given edge type. Supports both bipartite and non-bipartite graphs. Args: - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. edge_type (torch.Tensor): The edge type. num_edge_types (int, optional): The maximum number of edge types. 
When not given, will be computed on-the-fly, leading to @@ -145,32 +151,40 @@ def get_typed_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 - row, colptr, num_src_nodes = csc + perm = None + if isinstance(edge_index, torch_geometric.EdgeIndex): + edge_index, perm = edge_index.sort_by("col") + edge_type = edge_type[perm] + num_src_nodes = edge_index.get_sparse_size(0) + (colptr, row), _ = edge_index.get_csc() + else: + row, colptr, num_src_nodes = edge_index edge_type = edge_type.int() - return HeteroCSC( - offsets=colptr, - indices=row, - edge_types=edge_type, - num_src_nodes=num_src_nodes, - num_edge_types=num_edge_types, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, + return ( + pylibcugraphops.pytorch.HeteroCSC( + offsets=colptr, + indices=row, + edge_types=edge_type, + num_src_nodes=num_src_nodes, + num_edge_types=num_edge_types, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ), + perm, ) def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], ) -> torch.Tensor: r"""Runs the forward pass of the module. Args: x (torch.Tensor): The node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. """ raise NotImplementedError diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py index d1785f2bef8..981b1c5b50d 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_gat_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") nn = import_optional("torch.nn") @@ -159,7 +159,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, max_num_neighbors: Optional[int] = None, deterministic_dgrad: bool = False, @@ -172,11 +172,7 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. max_num_neighbors (int, optional): The maximum number of neighbors of a destination node. When enabled, it allows models to use @@ -198,9 +194,12 @@ def forward( the corresponding input type at the very end. 
""" bipartite = not isinstance(x, torch.Tensor) - graph = self.get_cugraph( - csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors + graph, perm = self.get_cugraph( + edge_index=edge_index, + bipartite=bipartite, + max_num_neighbors=max_num_neighbors, ) + if deterministic_dgrad: graph.add_reverse_graph() @@ -212,6 +211,8 @@ def forward( ) if edge_attr.dim() == 1: edge_attr = edge_attr.view(-1, 1) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) if bipartite: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py index 33865898816..ebb30de9754 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") nn = import_optional("torch.nn") @@ -172,7 +172,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, deterministic_dgrad: bool = False, deterministic_wgrad: bool = False, @@ -182,11 +182,7 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. deterministic_dgrad : bool, default=False Optional flag indicating whether the feature gradients @@ -196,7 +192,7 @@ def forward( are computed deterministically using a dedicated workspace buffer. """ bipartite = not isinstance(x, torch.Tensor) or not self.share_weights - graph = self.get_cugraph(csc, bipartite=bipartite) + graph, perm = self.get_cugraph(edge_index, bipartite=bipartite) if deterministic_dgrad: graph.add_reverse_graph() @@ -208,6 +204,8 @@ def forward( ) if edge_attr.dim() == 1: edge_attr = edge_attr.view(-1, 1) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) if bipartite: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py index 3b717552a96..a73dd8e57ff 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,6 +18,7 @@ from pylibcugraphops.pytorch.operators import mha_gat_n2n from .base import BaseConv +from cugraph_pyg.utils.imports import package_available torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -74,10 +75,10 @@ def __init__( bias: bool = True, aggr: str = "sum", ): - major, minor, patch = torch_geometric.__version__.split(".")[:3] - pyg_version = tuple(map(int, [major, minor, patch])) - if pyg_version < (2, 4, 0): - raise RuntimeError(f"{self.__class__.__name__} requires pyg >= 2.4.0.") + if not package_available("torch_geometric>=2.4.0"): + raise RuntimeError( + f"{self.__class__.__name__} requires torch_geometric>=2.4.0." + ) super().__init__() @@ -225,7 +226,7 @@ def forward( ) if src_type == dst_type: - graph = self.get_cugraph( + graph, _ = self.get_cugraph( csc, bipartite=False, ) @@ -240,7 +241,7 @@ def forward( ) else: - graph = self.get_cugraph( + graph, _ = self.get_cugraph( csc, bipartite=True, ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py index 683780b66eb..13fa08db5c5 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,12 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple +from typing import Optional, Union from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -110,13 +110,16 @@ def reset_parameters(self): def forward( self, x: torch.Tensor, - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_type: torch.Tensor, max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: - graph = self.get_typed_cugraph( - csc, edge_type, self.num_relations, max_num_neighbors=max_num_neighbors + graph, _ = self.get_typed_cugraph( + edge_index, + edge_type, + self.num_relations, + max_num_neighbors=max_num_neighbors, ) out = agg_hg_basis_n2n_post( diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py index 8e0c1027416..65dc99d8988 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import agg_concat_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -116,12 +116,14 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: bipartite = isinstance(x, Tuple) - graph = self.get_cugraph( - csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors + graph, _ = self.get_cugraph( + edge_index=edge_index, + bipartite=bipartite, + max_num_neighbors=max_num_neighbors, ) if self.project: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py index 41c0b4b4090..e184ee0e893 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_simple_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") nn = import_optional("torch.nn") @@ -153,7 +153,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, ) -> torch.Tensor: r"""Runs the forward pass of the module. @@ -161,15 +161,11 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. """ bipartite = True - graph = self.get_cugraph(csc, bipartite=bipartite) + graph, perm = self.get_cugraph(edge_index=edge_index, bipartite=bipartite) if isinstance(x, torch.Tensor): x = (x, x) @@ -184,6 +180,8 @@ def forward( f"{self.__class__.__name__}.edge_dim must be set to accept " f"edge features." 
) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) out = mha_simple_n2n( diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index ffab54efe08..8bcfb783ae1 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -171,8 +171,8 @@ def _sampler_output_from_sampling_results_homogeneous_coo( row=row_dict, col=col_dict, edge=None, - num_sampled_nodes=num_nodes_per_hop_dict, - num_sampled_edges=num_edges_per_hop_dict, + num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, + num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, metadata=metadata, ) @@ -222,7 +222,9 @@ def _sampler_output_from_sampling_results_homogeneous_csr( major_offsets = major_offsets.clone() - major_offsets[0] label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0] - num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().cpu()} + num_edges_per_hop_dict = { + edge_type: major_offsets[label_hop_offsets].diff().tolist() + } label_hop_offsets = label_hop_offsets.cpu() num_nodes_per_hop_dict = { @@ -231,7 +233,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( label_hop_offsets.diff(), (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)), ] - ).cpu() + ).tolist() } noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} @@ -397,8 +399,8 @@ def _sampler_output_from_sampling_results_heterogeneous( row=row_dict, col=col_dict, edge=None, - num_sampled_nodes=num_nodes_per_hop_dict, - num_sampled_edges=num_edges_per_hop_dict, + num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, + num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, metadata=metadata, ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index 7047c62250b..85acbebc3ec 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -373,10 +373,15 @@ def test_get_input_nodes(karate_gnn, dask_client): F, G, N = karate_gnn cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) - node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes( + nodes = torch_geometric.loader.utils.get_input_nodes( (cugraph_store, cugraph_store), "type0" ) + if len(nodes) == 2: + node_type, input_nodes = nodes + else: + node_type, input_nodes, _ = nodes + assert node_type == "type0" assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 62bebb9211d..a26063f62fa 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -14,10 +14,15 @@ import pytest from cugraph_pyg.nn import GATConv as CuGraphGATConv +from cugraph_pyg.utils.imports import package_available ATOL = 1e-6 +@pytest.mark.skipif( + package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5" +) +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @@ -26,10 +31,18 @@ @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_gat_conv_equality( - bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr, graph, request + use_edge_index, + bias, + bipartite, + concat, + heads, + max_num_neighbors, + use_edge_attr, + graph, + request, ): - pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import GATConv torch.manual_seed(12345) @@ -50,13 +63,19 @@ def test_gat_conv_equality( if use_edge_attr: edge_dim = 3 edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - csc, edge_attr_perm = CuGraphGATConv.to_csc( - edge_index, size, edge_attr=edge_attr - ) else: - edge_dim = None - edge_attr = edge_attr_perm = None - csc = CuGraphGATConv.to_csc(edge_index, size) + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphGATConv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphGATConv.to_csc(edge_index, size) + edge_attr_perm = None kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim) @@ -68,19 +87,24 @@ def test_gat_conv_equality( out_dim = heads * out_channels with torch.no_grad(): if bipartite: - conv2.lin_src.weight.data = conv1.lin_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.lin_dst.weight.data.detach().clone() + conv2.lin_src.weight.copy_(conv1.lin_src.weight) + conv2.lin_dst.weight.copy_(conv1.lin_dst.weight) else: - conv2.lin.weight.data = conv1.lin_src.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.lin.weight) - conv2.att.data[:out_dim] = conv1.att_src.data.flatten() - conv2.att.data[out_dim : 2 * out_dim] = conv1.att_dst.data.flatten() + conv2.att[:out_dim].copy_(conv1.att_src.flatten()) + conv2.att[out_dim : 2 * out_dim].copy_(conv1.att_dst.flatten()) if use_edge_attr: - conv2.att.data[2 * out_dim :] = conv1.att_edge.data.flatten() - conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() + conv2.att[2 * out_dim :].copy_(conv1.att_edge.flatten()) + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) out1 = conv1(x, edge_index, edge_attr=edge_attr) - out2 = conv2(x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr, max_num_neighbors=max_num_neighbors) + else: + out2 = conv2( + x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors + ) assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) @@ -95,9 +119,7 @@ def test_gat_conv_equality( conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: - assert torch.allclose( - conv1.lin_src.weight.grad, conv2.lin.weight.grad, atol=ATOL - ) + assert torch.allclose(conv1.lin.weight.grad, conv2.lin.weight.grad, atol=ATOL) assert torch.allclose( conv1.att_src.grad.flatten(), 
conv2.att.grad[:out_dim], atol=ATOL diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index a4794628410..a62f2fed2f7 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,14 +18,18 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, request): +def test_gatv2_conv_equality( + use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request +): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import GATv2Conv torch.manual_seed(12345) @@ -46,13 +50,19 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req if use_edge_attr: edge_dim = 3 edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - csc, edge_attr_perm = CuGraphGATv2Conv.to_csc( - edge_index, size, edge_attr=edge_attr - ) else: - edge_dim = None - edge_attr = edge_attr_perm = None - csc = CuGraphGATv2Conv.to_csc(edge_index, size) + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphGATv2Conv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphGATv2Conv.to_csc(edge_index, size) + edge_attr_perm = None kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim) @@ -62,14 +72,17 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda() with torch.no_grad(): - conv2.lin_src.weight.data = conv1.lin_l.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.lin_r.weight.data.detach().clone() - conv2.att.data = conv1.att.data.flatten().detach().clone() + conv2.lin_src.weight.copy_(conv1.lin_l.weight) + conv2.lin_dst.weight.copy_(conv1.lin_r.weight) + conv2.att.copy_(conv1.att.flatten()) if use_edge_attr: - conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) out1 = conv1(x, edge_index, edge_attr=edge_attr) - out2 = conv2(x, csc, edge_attr=edge_attr_perm) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr) + else: + out2 = conv2(x, csc, edge_attr=edge_attr_perm) assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py index 1c841a17df7..d8190ea345f 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,31 +14,23 @@ import pytest from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") +from cugraph_pyg.utils.imports import package_available ATOL = 1e-6 @pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif( - isinstance(torch_geometric, MissingModule), reason="torch_geometric not available" + package_available("torch_geometric<2.4"), reason="Test requires pyg>=2.4" ) @pytest.mark.parametrize("heads", [1, 3, 10]) @pytest.mark.parametrize("aggr", ["sum", "mean"]) def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): - major, minor, patch = torch_geometric.__version__.split(".")[:3] - pyg_version = tuple(map(int, [major, minor, patch])) - if pyg_version < (2, 4, 0): - pytest.skip("Skipping HeteroGATConv test") - + import torch from torch_geometric.data import HeteroData from torch_geometric.nn import HeteroConv, GATConv - device = torch.device("cuda:0") + device = torch.device("cuda") data = HeteroData(sample_pyg_hetero_data).to(device) in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()} @@ -73,16 +65,19 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): with torch.no_grad(): for edge_type in conv2.edge_types: src_t, _, dst_t = edge_type - w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] - if w_dst[edge_type] is not None: - w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] - - conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ - edge_type - ].att_src.data.flatten() - conv2.attn_weights[edge_type][heads * out_channels :] = conv1.convs[ - edge_type - ].att_dst.data.flatten() + if src_t == dst_t: + w_src[edge_type].copy_(conv1.convs[edge_type].lin.weight) + else: + w_src[edge_type].copy_(conv1.convs[edge_type].lin_src.weight) + if w_dst[edge_type] is not None: + w_dst[edge_type].copy_(conv1.convs[edge_type].lin_dst.weight) + + conv2.attn_weights[edge_type][: heads * out_channels].copy_( + conv1.convs[edge_type].att_src.flatten() + ) + conv2.attn_weights[edge_type][heads * out_channels :].copy_( + conv1.convs[edge_type].att_dst.flatten() + ) out1 = conv1(data.x_dict, data.edge_index_dict) out2 = conv2(data.x_dict, data.edge_index_dict) @@ -118,7 +113,11 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items(): grad_list = [] for rel_t in rels_as_src: - grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) + src_type, _, dst_type = rel_t + if src_type == dst_type: + grad_list.append(conv1.convs[rel_t].lin.weight.grad.clone()) + else: + grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) for rel_t in rels_as_dst: grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone()) assert len(grad_list) > 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index ded4f300c0c..fc0aaf25b7b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -1,4 +1,4 
@@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,6 +18,7 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("aggr", ["add", "sum", "mean"]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("max_num_neighbors", [8, None]) @@ -25,10 +26,18 @@ @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_rgcn_conv_equality( - aggr, bias, max_num_neighbors, num_bases, root_weight, graph, request + use_edge_index, + aggr, + bias, + max_num_neighbors, + num_bases, + root_weight, + graph, + request, ): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import FastRGCNConv as RGCNConv torch.manual_seed(12345) @@ -39,23 +48,30 @@ def test_rgcn_conv_equality( edge_index = edge_index.cuda() edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda() + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) + x = torch.rand(size[0], in_channels, device="cuda") - csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() with torch.no_grad(): if root_weight: - conv2.weight.data[:-1] = conv1.weight.data - conv2.weight.data[-1] = conv1.root.data + conv2.weight[:-1].copy_(conv1.weight) + conv2.weight[-1].copy_(conv1.root) else: - conv2.weight.data = conv1.weight.data.detach().clone() + conv2.weight.copy_(conv1.weight) if num_bases is not None: - conv2.comp.data = conv1.comp.data.detach().clone() + conv2.comp.copy_(conv1.comp) out1 = conv1(x, edge_index, edge_type) - out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) + if use_edge_index: + out2 = conv2(x, csc, edge_type) + else: + out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index b2977d1d175..9d8d413c590 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,6 +18,7 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @@ -26,16 +27,29 @@ @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_sage_conv_equality( - aggr, bias, bipartite, max_num_neighbors, normalize, root_weight, graph, request + use_edge_index, + aggr, + bias, + bipartite, + max_num_neighbors, + normalize, + root_weight, + graph, + request, ): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import SAGEConv torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() - csc = CuGraphSAGEConv.to_csc(edge_index, size) + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + csc = CuGraphSAGEConv.to_csc(edge_index, size) if bipartite: in_channels = (7, 3) @@ -55,11 +69,11 @@ def test_sage_conv_equality( in_channels_src = conv2.in_channels_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_channels_src] = conv1.lin_l.weight.data + conv2.lin.weight[:, :in_channels_src].copy_(conv1.lin_l.weight) if root_weight: - conv2.lin.weight.data[:, in_channels_src:] = conv1.lin_r.weight.data + conv2.lin.weight[:, in_channels_src:].copy_(conv1.lin_r.weight) if bias: - conv2.lin.bias.data[:] = conv1.lin_l.bias.data + conv2.lin.bias.copy_(conv1.lin_l.bias) out1 = conv1(x, edge_index) out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index fbdb244898b..1776b691c87 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,22 +18,23 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) +@pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -def test_transformer_conv_equality(bipartite, concat, heads, graph, request): +def test_transformer_conv_equality( + use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request +): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import TransformerConv torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() - csc = CuGraphTransformerConv.to_csc(edge_index, size) - - out_channels = 2 - kwargs = dict(concat=concat, bias=False, root_weight=False) if bipartite: in_channels = (5, 3) @@ -44,20 +45,45 @@ def test_transformer_conv_equality(bipartite, concat, heads, graph, request): else: in_channels = 5 x = torch.rand(size[0], in_channels, device="cuda") + out_channels = 2 + + if use_edge_attr: + edge_dim = 3 + edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() + else: + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphTransformerConv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphTransformerConv.to_csc(edge_index, size) + edge_attr_perm = None + + kwargs = dict(concat=concat, bias=False, edge_dim=edge_dim, root_weight=False) conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda() conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda() with torch.no_grad(): - conv2.lin_query.weight.data = conv1.lin_query.weight.data.detach().clone() - conv2.lin_key.weight.data = conv1.lin_key.weight.data.detach().clone() - conv2.lin_value.weight.data = conv1.lin_value.weight.data.detach().clone() - conv2.lin_query.bias.data = conv1.lin_query.bias.data.detach().clone() - conv2.lin_key.bias.data = conv1.lin_key.bias.data.detach().clone() - conv2.lin_value.bias.data = conv1.lin_value.bias.data.detach().clone() + conv2.lin_query.weight.copy_(conv1.lin_query.weight) + conv2.lin_key.weight.copy_(conv1.lin_key.weight) + conv2.lin_value.weight.copy_(conv1.lin_value.weight) + conv2.lin_query.bias.copy_(conv1.lin_query.bias) + conv2.lin_key.bias.copy_(conv1.lin_key.bias) + conv2.lin_value.bias.copy_(conv1.lin_value.bias) + if use_edge_attr: + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - out1 = conv1(x, edge_index) - out2 = conv2(x, csc) + out1 = conv1(x, edge_index, edge_attr=edge_attr) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr) + else: + out2 = conv2(x, csc, edge_attr=edge_attr_perm) assert torch.allclose(out1, out2, atol=ATOL) @@ -81,3 +107,8 @@ def test_transformer_conv_equality(bipartite, concat, heads, graph, request): assert torch.allclose( conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=ATOL ) + + if use_edge_attr: + assert torch.allclose( + conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL + ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 9813fa933ee..ab20ef01fd3 100644 --- 
a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,7 +32,11 @@ torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") + trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer") +if isinstance(trim_to_layer, MissingModule): + trim_to_layer = import_optional("torch_geometric.utils._trim_to_layer") + try: import torch_sparse # noqa: F401 @@ -278,8 +282,8 @@ def test_cugraph_loader_from_disk_subset_csr(): ) assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist() - assert sample["t0"]["num_sampled_nodes"].tolist() == [1, 3, 2] - assert sample["t0", "knows", "t0"]["num_sampled_edges"].tolist() == [3, 5] + assert sample["t0"]["num_sampled_nodes"] == [1, 3, 2] + assert sample["t0", "knows", "t0"]["num_sampled_edges"] == [3, 5] assert num_samples == 100 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index e703d477b70..ed011a658a9 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -80,10 +80,10 @@ def test_neighbor_sample(basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] + assert out.num_sampled_nodes["vt1"] == [4, 1] assert len(out.num_sampled_edges) == 1 - assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] + assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6] @pytest.mark.cugraph_ops @@ -136,15 +136,15 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 0] - assert out.num_sampled_nodes["brown"].tolist() == [3, 0] + assert out.num_sampled_nodes["black"] == [2, 0] + assert out.num_sampled_nodes["brown"] == [3, 0] assert len(out.num_sampled_edges) == 5 - assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2] - assert out.num_sampled_edges[("brown", "tortoise", "black")].tolist() == [3] - assert out.num_sampled_edges[("brown", "mongoose", "black")].tolist() == [2] - assert out.num_sampled_edges[("black", "cow", "brown")].tolist() == [2] - assert out.num_sampled_edges[("black", "snake", "black")].tolist() == [1] + assert out.num_sampled_edges[("brown", "horse", "brown")] == [2] + assert out.num_sampled_edges[("brown", "tortoise", "black")] == [3] + assert out.num_sampled_edges[("brown", "mongoose", "black")] == [2] + assert out.num_sampled_edges[("black", "cow", "brown")] == [2] + assert out.num_sampled_edges[("black", "snake", "black")] == [1] @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @@ -183,14 +183,14 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): assert out.col[("B", "ba", 
"A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] - assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] + assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 - assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] - assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1] - assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2] + assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0] + assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1] + assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2] @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") diff --git a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py new file mode 100644 index 00000000000..aeae6078111 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph-pyg/cugraph_pyg/utils/imports.py b/python/cugraph-pyg/cugraph_pyg/utils/imports.py new file mode 100644 index 00000000000..1cc865a1f35 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/utils/imports.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from packaging.requirements import Requirement +from importlib import import_module + + +def package_available(requirement: str) -> bool: + """Check if a package is installed and meets the version requirement.""" + req = Requirement(requirement) + try: + pkg = import_module(req.name) + except ImportError: + return False + + if len(req.specifier) > 0: + if hasattr(pkg, "__version__"): + return pkg.__version__ in req.specifier + else: + return False + + return True diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index eafadfa4ff0..2e3d032a20f 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -353,7 +353,7 @@ def uniform_neighbor_sample( else None, h_fan_out=fanout_vals, with_replacement=with_replacement, - do_expensive_check=True, + do_expensive_check=False, with_edge_properties=with_edge_properties, random_state=random_state, prior_sources_behavior=prior_sources_behavior, diff --git a/python/cugraph/cugraph/structure/convert_matrix.py b/python/cugraph/cugraph/structure/convert_matrix.py index ca8e93c482b..b9b9554b870 100644 --- a/python/cugraph/cugraph/structure/convert_matrix.py +++ b/python/cugraph/cugraph/structure/convert_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -277,6 +277,8 @@ def from_pandas_edgelist( renumber=True, ): """ + See :func:`networkx.convert_matrix.from_pandas_edgelist`. + Initialize a graph from the edge list. It is an error to call this method on an initialized Graph object. Source argument is source column name and destination argument is destination column name. 
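The conv-layer changes above replace the raw CSC tuple argument of forward() with a torch_geometric.EdgeIndex, while the legacy (row, colptr, num_src_nodes) tuple remains accepted. A minimal usage sketch, assuming PyG >= 2.5 and a CUDA-capable device; the toy sizes and variable names below are chosen only for illustration and are not part of the diff:

import torch
from torch_geometric import EdgeIndex
from cugraph_pyg.nn import GATConv as CuGraphGATConv

num_nodes, in_channels = 4, 8  # toy sizes for illustration
x = torch.rand(num_nodes, in_channels, device="cuda")
edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 0]], device="cuda")

# New path: pass an EdgeIndex carrying the sparse size; the layer sorts it by
# column, builds the CSC representation, and permutes edge_attr internally.
ei = EdgeIndex(edge_index, sparse_size=(num_nodes, num_nodes))
conv = CuGraphGATConv(in_channels, out_channels=2, heads=2).cuda()
out = conv(x, ei)

# Legacy path: the (row, colptr, num_src_nodes) tuple from to_csc() still works.
csc = CuGraphGATConv.to_csc(edge_index, (num_nodes, num_nodes))
out = conv(x, csc)

When an EdgeIndex is given, the edge_attr permutation is applied inside forward(), which is why the reworked tests above pass edge_attr unpermuted on that path.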
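The new package_available() helper drives the version gating used in HeteroGATConv and the pyg>=2.5 test skips. A short sketch of both call patterns; the requires_pyg_25 name is illustrative only:

import pytest
from cugraph_pyg.utils.imports import package_available

# Fail fast in a layer constructor when PyG is too old, as HeteroGATConv now does.
if not package_available("torch_geometric>=2.4.0"):
    raise RuntimeError("This layer requires torch_geometric>=2.4.0.")

# Skip a test when an older PyG is installed; note the inverted specifier:
# the test is skipped when the installed version satisfies "<2.5".
requires_pyg_25 = pytest.mark.skipif(
    package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5"
)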
diff --git a/python/cugraph/cugraph/tests/centrality/test_batch_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_batch_betweenness_centrality_mg.py index 1c73ebb0216..9f0980d4199 100644 --- a/python/cugraph/cugraph/tests/centrality/test_batch_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_batch_betweenness_centrality_mg.py @@ -93,6 +93,3 @@ def test_mg_betweenness_centrality( second_key="ref_bc", epsilon=DEFAULT_EPSILON, ) - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py index 4530dd3da86..4764c01f0fc 100644 --- a/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py @@ -84,5 +84,3 @@ def test_mg_edge_betweenness_centrality( second_key="ref_bc", epsilon=DEFAULT_EPSILON, ) - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py index db34c68a054..6d1f53f7fc3 100644 --- a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py +++ b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION.: +# Copyright (c) 2020-2024, NVIDIA CORPORATION.: # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -116,6 +116,8 @@ def calc_betweenness_centrality( create_using=cugraph.Graph(directed=directed), ignore_weights=not edgevals, ) + if multi_gpu_batch: + G.enable_batch() M = G.to_pandas_edgelist().rename( columns={"src": "0", "dst": "1", "wgt": edge_attr} @@ -130,8 +132,6 @@ def calc_betweenness_centrality( ) assert G is not None and Gnx is not None - if multi_gpu_batch: - G.enable_batch() calc_func = None if k is not None and seed is not None: diff --git a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py index c94c2dcaff6..35e199093ce 100644 --- a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py @@ -49,14 +49,12 @@ def setup_function(): def get_sg_graph(dataset, directed): - dataset.unload() G = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) return G def get_mg_graph(dataset, directed): - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=directed) dg.from_dask_cudf_edgelist( @@ -96,7 +94,6 @@ def test_dask_mg_betweenness_centrality( benchmark, ): g = get_sg_graph(dataset, directed) - dataset.unload() dg = get_mg_graph(dataset, directed) random_state = subset_seed @@ -143,6 +140,3 @@ def test_dask_mg_betweenness_centrality( diff = cupy.isclose(mg_bc_results, sg_bc_results) assert diff.all() - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py index 68daff9238c..8606649c745 100644 --- a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py +++ 
b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py @@ -45,14 +45,12 @@ def setup_function(): def get_sg_graph(dataset, directed): - dataset.unload() G = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) return G def get_mg_graph(dataset, directed): - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=directed) dg.from_dask_cudf_edgelist( @@ -118,6 +116,3 @@ def test_dask_mg_degree(dask_client, dataset, directed): check_names=False, check_dtype=False, ) - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index 80acfe1c4ad..5b83a05e2a2 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -47,7 +47,6 @@ def setup_function(): def get_sg_graph(dataset, directed, edge_ids): - dataset.unload() df = dataset.get_edgelist() if edge_ids: if not directed: @@ -71,7 +70,6 @@ def get_sg_graph(dataset, directed, edge_ids): def get_mg_graph(dataset, directed, edge_ids, weight): - dataset.unload() ddf = dataset.get_dask_edgelist() if weight: @@ -178,6 +176,3 @@ def test_dask_mg_edge_betweenness_centrality( assert len(edge_bc_diffs1) == 0 assert len(edge_bc_diffs2) == 0 - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py index 8cd77fb5e24..3a840c82e95 100644 --- a/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py @@ -52,7 +52,6 @@ def setup_function(): def test_dask_mg_eigenvector_centrality(dask_client, dataset, directed): input_data_path = dataset.get_path() print(f"dataset={input_data_path}") - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=True) dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=True) @@ -89,15 +88,11 @@ def test_dask_mg_eigenvector_centrality(dask_client, dataset, directed): err = err + 1 assert err == 0 - # Clean-up stored dataset edge-lists - dataset.unload() - @pytest.mark.mg def test_dask_mg_eigenvector_centrality_transposed_false(dask_client): dataset = DATASETS[0] - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=True) dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=False) @@ -110,6 +105,3 @@ def test_dask_mg_eigenvector_centrality_transposed_false(dask_client): with pytest.warns(UserWarning, match=warning_msg): dcg.eigenvector_centrality(dg) - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py index ebbe5974814..5dcbd8173df 100644 --- a/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py @@ -53,7 +53,6 @@ def test_dask_mg_katz_centrality(dask_client, dataset, directed): input_data_path = dataset.get_path() print(f"dataset={input_data_path}") - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=True) dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=True) @@ -95,16 +94,12 @@ def 
test_dask_mg_katz_centrality(dask_client, dataset, directed): err = err + 1 assert err == 0 - # Clean-up stored dataset edge-lists - dataset.unload() - @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize("directed", IS_DIRECTED) def test_dask_mg_katz_centrality_nstart(dask_client, dataset, directed): - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=True) dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=True) @@ -136,14 +131,10 @@ def test_dask_mg_katz_centrality_nstart(dask_client, dataset, directed): err = err + 1 assert err == 0 - # Clean-up stored dataset edge-lists - dataset.unload() - @pytest.mark.mg @pytest.mark.parametrize("dataset", DATASETS) def test_dask_mg_katz_centrality_transposed_false(dask_client, dataset): - dataset.unload() ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=True) dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=False) @@ -156,6 +147,3 @@ def test_dask_mg_katz_centrality_transposed_false(dask_client, dataset): with pytest.warns(UserWarning, match=warning_msg): dcg.katz_centrality(dg) - - # Clean-up stored dataset edge-lists - dataset.unload() diff --git a/python/cugraph/cugraph/tests/comms/test_comms_mg.py b/python/cugraph/cugraph/tests/comms/test_comms_mg.py index 75462924c9d..d096eb7e5c2 100644 --- a/python/cugraph/cugraph/tests/comms/test_comms_mg.py +++ b/python/cugraph/cugraph/tests/comms/test_comms_mg.py @@ -16,10 +16,9 @@ import pytest import cugraph.dask as dcg -import cudf -import dask_cudf import cugraph -from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH +from cugraph.datasets import karate, dolphins + # ============================================================================= # Pytest Setup / Teardown - called for each test function @@ -30,12 +29,36 @@ def setup_function(): gc.collect() +# ============================================================================= +# Parameters +# ============================================================================= + + +DATASETS = [karate, dolphins] IS_DIRECTED = [True, False] -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_pagerank_result(dataset, is_mg): + """Return the cugraph.pagerank result for an MG or SG graph""" + + if is_mg: + dg = dataset.get_dask_graph(store_transposed=True) + return dcg.pagerank(dg).compute() + else: + g = dataset.get_graph(store_transposed=True) + return cugraph.pagerank(g) + + +# ============================================================================= +# Tests +# ============================================================================= + + @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) def test_dask_mg_pagerank(dask_client, directed): @@ -43,62 +66,17 @@ def test_dask_mg_pagerank(dask_client, directed): # Initialize and run pagerank on two distributed graphs # with same communicator - input_data_path1 = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() + input_data_path1 = karate.get_path() print(f"dataset1={input_data_path1}") - chunksize1 = dcg.get_chunksize(input_data_path1) + result_pr1 = get_pagerank_result(karate, is_mg=True) - input_data_path2 = (RAPIDS_DATASET_ROOT_DIR_PATH / 
"dolphins.csv").as_posix() + input_data_path2 = dolphins.get_path() print(f"dataset2={input_data_path2}") - chunksize2 = dcg.get_chunksize(input_data_path2) - - ddf1 = dask_cudf.read_csv( - input_data_path1, - blocksize=chunksize1, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg1 = cugraph.Graph(directed=directed) - dg1.from_dask_cudf_edgelist(ddf1, "src", "dst") - - result_pr1 = dcg.pagerank(dg1).compute() - - ddf2 = dask_cudf.read_csv( - input_data_path2, - blocksize=chunksize2, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg2 = cugraph.Graph(directed=directed) - dg2.from_dask_cudf_edgelist(ddf2, "src", "dst") - - result_pr2 = dcg.pagerank(dg2).compute() + result_pr2 = get_pagerank_result(dolphins, is_mg=True) # Calculate single GPU pagerank for verification of results - df1 = cudf.read_csv( - input_data_path1, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - g1 = cugraph.Graph(directed=directed) - g1.from_cudf_edgelist(df1, "src", "dst") - expected_pr1 = cugraph.pagerank(g1) - - df2 = cudf.read_csv( - input_data_path2, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - g2 = cugraph.Graph(directed=directed) - g2.from_cudf_edgelist(df2, "src", "dst") - expected_pr2 = cugraph.pagerank(g2) + expected_pr1 = get_pagerank_result(karate, is_mg=False) + expected_pr2 = get_pagerank_result(dolphins, is_mg=False) # Compare and verify pagerank results diff --git a/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py b/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py index 45ec8eca0e8..311fd7a24bc 100644 --- a/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py +++ b/python/cugraph/cugraph/tests/community/test_induced_subgraph_mg.py @@ -17,7 +17,6 @@ import cugraph import cugraph.dask as dcg -import dask_cudf from cudf.testing.testing import assert_frame_equal from cugraph.dask.common.mg_utils import is_single_gpu from cugraph.datasets import karate, dolphins, email_Eu_core @@ -36,11 +35,13 @@ def setup_function(): # Parameters # ============================================================================= + DATASETS = [karate, dolphins, email_Eu_core] IS_DIRECTED = [True, False] NUM_VERTICES = [2, 5, 10, 20] OFFSETS = [None] + # ============================================================================= # Helper functions # ============================================================================= @@ -53,15 +54,7 @@ def get_sg_graph(dataset, directed): def get_mg_graph(dataset, directed): - input_data_path = dataset.get_path() - blocksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=blocksize, - delimiter=dataset.metadata["delim"], - names=dataset.metadata["col_names"], - dtype=dataset.metadata["col_types"], - ) + ddf = dataset.get_dask_edgelist() dg = cugraph.Graph(directed=directed) dg.from_dask_cudf_edgelist( ddf, @@ -108,7 +101,7 @@ def test_mg_induced_subgraph( # FIXME: This parameter is not yet tested # mg_offsets = mg_offsets.compute().reset_index(drop=True) - mg_df, mg_offsets = result_induced_subgraph + mg_df, _ = result_induced_subgraph if mg_df is not None and sg_induced_subgraph is not None: # FIXME: 'edges()' or 'view_edgelist()' takes half the edges out if diff --git a/python/cugraph/cugraph/tests/community/test_leiden_mg.py b/python/cugraph/cugraph/tests/community/test_leiden_mg.py index 
b1908ae10a2..2904ecd12a2 100644 --- a/python/cugraph/cugraph/tests/community/test_leiden_mg.py +++ b/python/cugraph/cugraph/tests/community/test_leiden_mg.py @@ -13,123 +13,56 @@ import pytest - -import dask_cudf import cugraph import cugraph.dask as dcg -from cugraph.testing import utils - +from cugraph.datasets import karate_asymmetric, karate, dolphins -try: - from rapids_pytest_benchmark import setFixtureParamNames -except ImportError: - print( - "\n\nWARNING: rapids_pytest_benchmark is not installed, " - "falling back to pytest_benchmark fixtures.\n" - ) - # if rapids_pytest_benchmark is not available, just perfrom time-only - # benchmarking and replace the util functions with nops - import pytest_benchmark +# ============================================================================= +# Parameters +# ============================================================================= - gpubenchmark = pytest_benchmark.plugin.benchmark - def setFixtureParamNames(*args, **kwargs): - pass +DATASETS = [karate, dolphins] +DATASETS_ASYMMETRIC = [karate_asymmetric] # ============================================================================= -# Parameters +# Helper Functions # ============================================================================= -DATASETS_ASYMMETRIC = [utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate-asymmetric.csv"] - - -############################################################################### -# Fixtures -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) -@pytest.fixture( - scope="module", - params=DATASETS_ASYMMETRIC, - ids=[f"dataset={d.as_posix()}" for d in DATASETS_ASYMMETRIC], -) -def daskGraphFromDataset(request, dask_client): - """ - Returns a new dask dataframe created from the dataset file param. - This creates a directed Graph. - """ - # Since parameterized fixtures do not assign param names to param values, - # manually call the helper to do so. - setFixtureParamNames(request, ["dataset"]) - dataset = request.param - - chunksize = dcg.get_chunksize(dataset) - ddf = dask_cudf.read_csv( - dataset, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=True) - dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value") - return dg -@pytest.fixture( - scope="module", - params=utils.DATASETS_UNDIRECTED, - ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNDIRECTED], -) -def uddaskGraphFromDataset(request, dask_client): - """ - Returns a new dask dataframe created from the dataset file param. - This creates an undirected Graph. - """ - # Since parameterized fixtures do not assign param names to param - # values, manually call the helper to do so. 
- setFixtureParamNames(request, ["dataset"]) - dataset = request.param - - chunksize = dcg.get_chunksize(dataset) - ddf = dask_cudf.read_csv( - dataset, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value") +def get_mg_graph(dataset, directed): + """Returns an MG graph""" + ddf = dataset.get_dask_edgelist() + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist(ddf, "src", "dst", "wgt") + return dg -############################################################################### +# ============================================================================= # Tests -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) +# ============================================================================= # FIXME: Implement more robust tests + + @pytest.mark.mg -def test_mg_leiden_with_edgevals_directed_graph(daskGraphFromDataset): +@pytest.mark.parametrize("dataset", DATASETS_ASYMMETRIC) +def test_mg_leiden_with_edgevals_directed_graph(dask_client, dataset): + dg = get_mg_graph(dataset, directed=True) # Directed graphs are not supported by Leiden and a ValueError should be # raised with pytest.raises(ValueError): - parts, mod = dcg.leiden(daskGraphFromDataset) + parts, mod = dcg.leiden(dg) -############################################################################### -# Tests -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) -# FIXME: Implement more robust tests @pytest.mark.mg -def test_mg_leiden_with_edgevals_undirected_graph(uddaskGraphFromDataset): - parts, mod = dcg.leiden(uddaskGraphFromDataset) +@pytest.mark.parametrize("dataset", DATASETS) +def test_mg_leiden_with_edgevals_undirected_graph(dask_client, dataset): + dg = get_mg_graph(dataset, directed=False) + parts, mod = dcg.leiden(dg) # FIXME: either call Nx with the same dataset and compare results, or # hardcode golden results to compare to. 
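The rewritten MG community tests share a single construction pattern: build the multi-GPU graph from a cugraph.datasets object rather than a hand-rolled dask_cudf.read_csv. A condensed sketch, assuming an initialized Dask cluster and cuGraph comms such as the dask_client fixture provides:

import cugraph
import cugraph.dask as dcg
from cugraph.datasets import karate

# get_dask_edgelist() returns a dask_cudf DataFrame with src/dst/wgt columns,
# replacing the removed dcg.get_chunksize() + dask_cudf.read_csv() boilerplate.
ddf = karate.get_dask_edgelist()

dg = cugraph.Graph(directed=False)
dg.from_dask_cudf_edgelist(ddf, "src", "dst", "wgt")

parts, mod = dcg.leiden(dg)  # directed graphs raise ValueError, per the test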
diff --git a/python/cugraph/cugraph/tests/community/test_louvain_mg.py b/python/cugraph/cugraph/tests/community/test_louvain_mg.py index 19fffe96b5c..0dff7f1c8b0 100644 --- a/python/cugraph/cugraph/tests/community/test_louvain_mg.py +++ b/python/cugraph/cugraph/tests/community/test_louvain_mg.py @@ -14,122 +14,41 @@ import pytest import cugraph.dask as dcg +from cugraph.datasets import karate_asymmetric, karate, dolphins -import cugraph -import dask_cudf -from cugraph.testing import utils - - -try: - from rapids_pytest_benchmark import setFixtureParamNames -except ImportError: - print( - "\n\nWARNING: rapids_pytest_benchmark is not installed, " - "falling back to pytest_benchmark fixtures.\n" - ) - - # if rapids_pytest_benchmark is not available, just perfrom time-only - # benchmarking and replace the util functions with nops - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - - def setFixtureParamNames(*args, **kwargs): - pass +from test_leiden_mg import get_mg_graph # ============================================================================= # Parameters # ============================================================================= -DATASETS_ASYMMETRIC = [utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate-asymmetric.csv"] - - -############################################################################### -# Fixtures -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) -@pytest.fixture( - scope="module", - params=DATASETS_ASYMMETRIC, - ids=[f"dataset={d.as_posix()}" for d in DATASETS_ASYMMETRIC], -) -def daskGraphFromDataset(request, dask_client): - """ - Returns a new dask dataframe created from the dataset file param. - This creates a directed Graph. - """ - # Since parameterized fixtures do not assign param names to param values, - # manually call the helper to do so. - setFixtureParamNames(request, ["dataset"]) - dataset = request.param - - chunksize = dcg.get_chunksize(dataset) - ddf = dask_cudf.read_csv( - dataset, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=True) - dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value") - return dg -@pytest.fixture( - scope="module", - params=utils.DATASETS_UNDIRECTED, - ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNDIRECTED], -) -def uddaskGraphFromDataset(request, dask_client): - """ - Returns a new dask dataframe created from the dataset file param. - This creates an undirected Graph. - """ - # Since parameterized fixtures do not assign param names to param - # values, manually call the helper to do so. 
-    setFixtureParamNames(request, ["dataset"])
-    dataset = request.param
+DATASETS_ASYMMETRIC = [karate_asymmetric]
+DATASETS = [karate, dolphins]
 
-    chunksize = dcg.get_chunksize(dataset)
-    ddf = dask_cudf.read_csv(
-        dataset,
-        blocksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")
-    return dg
-
-
-###############################################################################
+# =============================================================================
 # Tests
-# @pytest.mark.skipif(
-#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
-# )
+# =============================================================================
 # FIXME: Implement more robust tests
+
+
 @pytest.mark.mg
-def test_mg_louvain_with_edgevals_directed_graph(daskGraphFromDataset):
+@pytest.mark.parametrize("dataset", DATASETS_ASYMMETRIC)
+def test_mg_louvain_with_edgevals_directed_graph(dask_client, dataset):
+    dg = get_mg_graph(dataset, directed=True)
     # Directed graphs are not supported by Louvain and a ValueError should be
     # raised
     with pytest.raises(ValueError):
-        parts, mod = dcg.louvain(daskGraphFromDataset)
+        parts, mod = dcg.louvain(dg)
 
 
-###############################################################################
-# Tests
-# @pytest.mark.skipif(
-#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
-# )
-# FIXME: Implement more robust tests
 @pytest.mark.mg
-def test_mg_louvain_with_edgevals_undirected_graph(uddaskGraphFromDataset):
-    parts, mod = dcg.louvain(uddaskGraphFromDataset)
+@pytest.mark.parametrize("dataset", DATASETS)
+def test_mg_louvain_with_edgevals_undirected_graph(dask_client, dataset):
+    dg = get_mg_graph(dataset, directed=False)
+    parts, mod = dcg.louvain(dg)
 
     # FIXME: either call Nx with the same dataset and compare results, or
     # hardcode golden results to compare to.
diff --git a/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py b/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py
index 0a052845cf8..e2c47af8a1b 100644
--- a/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py
+++ b/python/cugraph/cugraph/tests/community/test_triangle_count_mg.py
@@ -16,115 +16,81 @@
 import random
 
 import pytest
-import cudf
-import dask_cudf
 
 import cugraph
 import cugraph.dask as dcg
-from cugraph.testing import utils
-from pylibcugraph.testing.utils import gen_fixture_params_product
+from cugraph.datasets import karate, dolphins
 
 
 # =============================================================================
 # Pytest Setup / Teardown - called for each test function
 # =============================================================================
+
+
 def setup_function():
     gc.collect()
 
 
 # =============================================================================
-# Pytest fixtures
+# Parameters
 # =============================================================================
-datasets = utils.DATASETS_UNDIRECTED
-fixture_params = gen_fixture_params_product(
-    (datasets, "graph_file"),
-    ([True, False], "start_list"),
-)
-
-
-@pytest.fixture(scope="module", params=fixture_params)
-def input_combo(request):
-    """
-    Simply return the current combination of params as a dictionary for use in
-    tests or other parameterized fixtures.
- """ - parameters = dict(zip(("graph_file", "start_list", "edgevals"), request.param)) - - return parameters - - -@pytest.fixture(scope="module") -def input_expected_output(dask_client, input_combo): - """ - This fixture returns the inputs and expected results from the triangle - count algo. - """ - start_list = input_combo["start_list"] - input_data_path = input_combo["graph_file"] - G = utils.generate_cugraph_graph_from_file( - input_data_path, directed=False, edgevals=True - ) - input_combo["SGGraph"] = G - if start_list: +DATASETS = [karate, dolphins] +START_LIST = [True, False] + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_sg_graph(dataset, directed, start): + G = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) + if start: # sample k nodes from the cuGraph graph - k = random.randint(1, 10) - srcs = G.view_edge_list()[G.source_columns] - dsts = G.view_edge_list()[G.destination_columns] - nodes = cudf.concat([srcs, dsts]).drop_duplicates() - start_list = nodes.sample(k) + start = G.select_random_vertices(num_vertices=random.randint(1, 10)) else: - start_list = None + start = None - sg_triangle_results = cugraph.triangle_count(G, start_list) - sg_triangle_results = sg_triangle_results.sort_values("vertex").reset_index( - drop=True - ) + return G, start - input_combo["sg_triangle_results"] = sg_triangle_results - input_combo["start_list"] = start_list - - # Creating an edgelist from a dask cudf dataframe - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - dg = cugraph.Graph(directed=False) +def get_mg_graph(dataset, directed): + ddf = dataset.get_dask_edgelist() + dg = cugraph.Graph(directed=directed) dg.from_dask_cudf_edgelist( - ddf, source="src", destination="dst", edge_attr="value", renumber=True + ddf, source="src", destination="dst", edge_attr="wgt", renumber=True ) - input_combo["MGGraph"] = dg - - return input_combo + return dg # ============================================================================= # Tests # ============================================================================= + + @pytest.mark.mg -def test_sg_triangles(dask_client, benchmark, input_expected_output): +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("start", START_LIST) +def test_sg_triangles(dask_client, dataset, start, benchmark): # This test is only for benchmark purposes. 
sg_triangle_results = None - G = input_expected_output["SGGraph"] - start_list = input_expected_output["start_list"] - sg_triangle_results = benchmark(cugraph.triangle_count, G, start_list) + G, start = get_sg_graph(dataset, False, start) + + sg_triangle_results = benchmark(cugraph.triangle_count, G, start) + sg_triangle_results.sort_values("vertex").reset_index(drop=True) assert sg_triangle_results is not None @pytest.mark.mg -def test_triangles(dask_client, benchmark, input_expected_output): - - dg = input_expected_output["MGGraph"] - start_list = input_expected_output["start_list"] - - result_counts = benchmark(dcg.triangle_count, dg, start_list) +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("start", START_LIST) +def test_triangles(dask_client, dataset, start, benchmark): + G, start = get_sg_graph(dataset, False, start) + dg = get_mg_graph(dataset, False) + result_counts = benchmark(dcg.triangle_count, dg, start) result_counts = ( result_counts.drop_duplicates() .compute() @@ -132,8 +98,9 @@ def test_triangles(dask_client, benchmark, input_expected_output): .reset_index(drop=True) .rename(columns={"counts": "mg_counts"}) ) - - expected_output = input_expected_output["sg_triangle_results"] + expected_output = ( + cugraph.triangle_count(G, start).sort_values("vertex").reset_index(drop=True) + ) # Update the mg triangle count with sg triangle count results # for easy comparison using cuDF DataFrame methods. diff --git a/python/cugraph/cugraph/tests/components/test_connectivity_mg.py b/python/cugraph/cugraph/tests/components/test_connectivity_mg.py index 26e8ed17bcb..4ab251c0e29 100644 --- a/python/cugraph/cugraph/tests/components/test_connectivity_mg.py +++ b/python/cugraph/cugraph/tests/components/test_connectivity_mg.py @@ -15,11 +15,9 @@ import pytest -import cudf -import dask_cudf import cugraph import cugraph.dask as dcg -from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH +from cugraph.datasets import netscience # ============================================================================= @@ -31,42 +29,47 @@ def setup_function(): gc.collect() +# ============================================================================= +# Parameters +# ============================================================================= + + +DATASETS = [netscience] # Directed graph is not currently supported IS_DIRECTED = [False, True] -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) +# ============================================================================= +# Helper +# ============================================================================= + + +def get_mg_graph(dataset, directed): + """Returns an MG graph""" + ddf = dataset.get_dask_edgelist() + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist(ddf, "src", "dst", "wgt") + + return dg + + +# ============================================================================= +# Tests +# ============================================================================= + + @pytest.mark.mg +@pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_mg_wcc(dask_client, directed): - - input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() +def test_dask_mg_wcc(dask_client, dataset, directed): + input_data_path = dataset.get_path() print(f"dataset={input_data_path}") - chunksize = dcg.get_chunksize(input_data_path) - - ddf = dask_cudf.read_csv( - input_data_path, - 
blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - df = cudf.read_csv( - input_data_path, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - g = cugraph.Graph(directed=directed) - g.from_cudf_edgelist(df, "src", "dst", renumber=True) - dg = cugraph.Graph(directed=directed) - dg.from_dask_cudf_edgelist(ddf, "src", "dst") + g = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) + dg = get_mg_graph(dataset, directed) + # breakpoint() if not directed: expected_dist = cugraph.weakly_connected_components(g) result_dist = dcg.weakly_connected_components(dg) diff --git a/python/cugraph/cugraph/tests/core/test_core_number_mg.py b/python/cugraph/cugraph/tests/core/test_core_number_mg.py index f771ce513eb..3d9a7bef5be 100644 --- a/python/cugraph/cugraph/tests/core/test_core_number_mg.py +++ b/python/cugraph/cugraph/tests/core/test_core_number_mg.py @@ -15,107 +15,64 @@ import pytest -import dask_cudf import cugraph import cugraph.dask as dcg -from cugraph.testing import utils -from pylibcugraph.testing.utils import gen_fixture_params_product +from cugraph.datasets import karate, dolphins, karate_asymmetric # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= + + def setup_function(): gc.collect() # ============================================================================= -# Pytest fixtures +# Parameters # ============================================================================= -datasets = utils.DATASETS_UNDIRECTED -degree_type = ["incoming", "outgoing", "bidirectional"] - -fixture_params = gen_fixture_params_product( - (datasets, "graph_file"), - (degree_type, "degree_type"), -) - - -@pytest.fixture(scope="module", params=fixture_params) -def input_combo(request): - """ - Simply return the current combination of params as a dictionary for use in - tests or other parameterized fixtures. - """ - parameters = dict(zip(("graph_file", "degree_type"), request.param)) - - return parameters - - -@pytest.fixture(scope="module") -def input_expected_output(dask_client, input_combo): - """ - This fixture returns the inputs and expected results from the Core number - algo. 
- """ - degree_type = input_combo["degree_type"] - input_data_path = input_combo["graph_file"] - G = utils.generate_cugraph_graph_from_file( - input_data_path, directed=False, edgevals=True - ) - input_combo["SGGraph"] = G - sg_core_number_results = cugraph.core_number(G, degree_type) - sg_core_number_results = sg_core_number_results.sort_values("vertex").reset_index( - drop=True - ) +DATASETS = [karate, dolphins] +DEGREE_TYPE = ["incoming", "outgoing", "bidirectional"] - input_combo["sg_core_number_results"] = sg_core_number_results - input_combo["degree_type"] = degree_type - - # Creating an edgelist from a dask cudf dataframe - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, source="src", destination="dst", edge_attr="value", renumber=True - ) +# ============================================================================= +# Helper Functions +# ============================================================================= - input_combo["MGGraph"] = dg - return input_combo +def get_sg_results(dataset, degree_type): + G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) + res = cugraph.core_number(G, degree_type) + res = res.sort_values("vertex").reset_index(drop=True) + return res # ============================================================================= # Tests # ============================================================================= + + @pytest.mark.mg -def test_sg_core_number(dask_client, benchmark, input_expected_output): +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_sg_core_number(dask_client, dataset, degree_type, benchmark): # This test is only for benchmark purposes. sg_core_number_results = None - G = input_expected_output["SGGraph"] - degree_type = input_expected_output["degree_type"] - + G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) sg_core_number_results = benchmark(cugraph.core_number, G, degree_type) assert sg_core_number_results is not None @pytest.mark.mg -def test_core_number(dask_client, benchmark, input_expected_output): - - dg = input_expected_output["MGGraph"] - degree_type = input_expected_output["degree_type"] +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_core_number(dask_client, dataset, degree_type, benchmark): + dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=False)) result_core_number = benchmark(dcg.core_number, dg, degree_type) - result_core_number = ( result_core_number.drop_duplicates() .compute() @@ -124,7 +81,7 @@ def test_core_number(dask_client, benchmark, input_expected_output): .rename(columns={"core_number": "mg_core_number"}) ) - expected_output = input_expected_output["sg_core_number_results"] + expected_output = get_sg_results(dataset, degree_type) # Update the mg core number with sg core number results # for easy comparison using cuDF DataFrame methods. 
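The assertion that follows this hunk (kept as unchanged context, so not shown in the diff) compares the MG core numbers against the SG baseline using cuDF DataFrame operations. A hedged sketch of that comparison, using the column names produced by the `rename()` call above (`mg_core_number`) and by `cugraph.core_number` (`core_number`); the helper name is illustrative:

```python
import cudf


def count_core_number_mismatches(mg_df: cudf.DataFrame, sg_df: cudf.DataFrame) -> int:
    # mg_df columns: ["vertex", "mg_core_number"]; sg_df columns: ["vertex", "core_number"]
    merged = mg_df.merge(sg_df, on="vertex", how="inner")
    mismatches = merged["mg_core_number"] != merged["core_number"]
    return int(mismatches.sum())
```

The MG test then asserts that this mismatch count is zero.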
@@ -135,30 +92,10 @@ def test_core_number(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_core_number_invalid_input(input_expected_output): - input_data_path = ( - utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate-asymmetric.csv" - ).as_posix() - - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=True) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - ) +def test_core_number_invalid_input(): + dg = karate_asymmetric.get_graph(create_using=cugraph.Graph(directed=True)) invalid_degree_type = 3 - dg = input_expected_output["MGGraph"] + with pytest.raises(ValueError): dcg.core_number(dg, invalid_degree_type) diff --git a/python/cugraph/cugraph/tests/core/test_k_core_mg.py b/python/cugraph/cugraph/tests/core/test_k_core_mg.py index b2ac18cf3a9..c7ad6d2d41d 100644 --- a/python/cugraph/cugraph/tests/core/test_k_core_mg.py +++ b/python/cugraph/cugraph/tests/core/test_k_core_mg.py @@ -1,4 +1,5 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,58 +16,39 @@ import pytest -import dask_cudf import cugraph import cugraph.dask as dcg -from cugraph.testing import utils +from cugraph.datasets import karate, dolphins from cudf.testing.testing import assert_frame_equal from cugraph.structure.symmetrize import symmetrize_df -from pylibcugraph.testing import gen_fixture_params_product # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= + + def setup_function(): gc.collect() # ============================================================================= -# Pytest fixtures +# Parameters # ============================================================================= -datasets = utils.DATASETS_UNDIRECTED - -core_number = [True, False] -degree_type = ["bidirectional", "outgoing", "incoming"] - -fixture_params = gen_fixture_params_product( - (datasets, "graph_file"), (core_number, "core_number"), (degree_type, "degree_type") -) - - -@pytest.fixture(scope="module", params=fixture_params) -def input_combo(request): - """ - Simply return the current combination of params as a dictionary for use in - tests or other parameterized fixtures. - """ - parameters = dict(zip(("graph_file", "core_number", "degree_type"), request.param)) - - return parameters - - -@pytest.fixture(scope="module") -def input_expected_output(dask_client, input_combo): - """ - This fixture returns the inputs and expected results from the Core number - algo. 
- """ - core_number = input_combo["core_number"] - degree_type = input_combo["degree_type"] - input_data_path = input_combo["graph_file"] - G = utils.generate_cugraph_graph_from_file( - input_data_path, directed=False, edgevals=True - ) + + +DATASETS = [karate, dolphins] +CORE_NUMBER = [True, False] +DEGREE_TYPE = ["bidirectional", "outgoing", "incoming"] + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_sg_results(dataset, core_number, degree_type): + G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) if core_number: # compute the core_number @@ -74,62 +56,41 @@ def input_expected_output(dask_client, input_combo): else: core_number = None - input_combo["core_number"] = core_number - - input_combo["SGGraph"] = G - sg_k_core_graph = cugraph.k_core( G, core_number=core_number, degree_type=degree_type ) - sg_k_core_results = sg_k_core_graph.view_edge_list() + res = sg_k_core_graph.view_edge_list() # FIXME: The result will come asymetric. Symmetrize the results srcCol = sg_k_core_graph.source_columns dstCol = sg_k_core_graph.destination_columns wgtCol = sg_k_core_graph.weight_column - sg_k_core_results = ( - symmetrize_df(sg_k_core_results, srcCol, dstCol, wgtCol) + res = ( + symmetrize_df(res, srcCol, dstCol, wgtCol) .sort_values([srcCol, dstCol]) .reset_index(drop=True) ) - input_combo["sg_k_core_results"] = sg_k_core_results - - # Creating an edgelist from a dask cudf dataframe - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - # FIXME: False when renumbering (C++ and python renumbering) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - ) - - input_combo["MGGraph"] = dg - - return input_combo + return res, core_number # ============================================================================= # Tests # ============================================================================= + + @pytest.mark.mg -def test_sg_k_core(dask_client, benchmark, input_expected_output): +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("core_number", CORE_NUMBER) +@pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_sg_k_core(dask_client, dataset, core_number, degree_type, benchmark): # This test is only for benchmark purposes. 
sg_k_core = None - G = input_expected_output["SGGraph"] - core_number = input_expected_output["core_number"] - degree_type = input_expected_output["degree_type"] - + G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) + if core_number: + # compute the core_number + core_number = cugraph.core_number(G, degree_type=degree_type) + else: + core_number = None sg_k_core = benchmark( cugraph.k_core, G, core_number=core_number, degree_type=degree_type ) @@ -137,15 +98,16 @@ def test_sg_k_core(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_mg_k_core(dask_client, benchmark, input_expected_output): - - dg = input_expected_output["MGGraph"] - core_number = input_expected_output["core_number"] +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("core_number", CORE_NUMBER) +@pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_dask_mg_k_core(dask_client, dataset, core_number, degree_type, benchmark): + expected_k_core_results, core_number = get_sg_results( + dataset, core_number, degree_type + ) + dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=False)) k_core_results = benchmark(dcg.k_core, dg, core_number=core_number) - - expected_k_core_results = input_expected_output["sg_k_core_results"] - k_core_results = ( k_core_results.compute() .sort_values(["src", "dst"]) @@ -160,36 +122,13 @@ def test_dask_mg_k_core(dask_client, benchmark, input_expected_output): @pytest.mark.mg def test_dask_mg_k_core_invalid_input(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) + dataset = DATASETS[0] + dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=True)) - dg = cugraph.Graph(directed=True) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) with pytest.raises(ValueError): dcg.k_core(dg) - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) + dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=False)) degree_type = "invalid" with pytest.raises(ValueError): diff --git a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py index 45a3c46309d..64917d0c747 100644 --- a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py +++ b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py @@ -24,33 +24,61 @@ import dask_cudf import cugraph.dask as dcg import cugraph +from cugraph.datasets import karate, karate_disjoint from cugraph.testing import utils from cugraph.structure.number_map import NumberMap from cugraph.dask.common.mg_utils import is_single_gpu -from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH from cudf.testing import assert_frame_equal, assert_series_equal # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= + + def setup_function(): gc.collect() +# ============================================================================= +# Parameters +# ============================================================================= + + +DATASETS = [karate] 
+DATASETS_UNRENUMBERED = [karate_disjoint] IS_DIRECTED = [True, False] +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_sg_graph(dataset, directed): + dataset.unload() + g = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) + + return g + + +def get_mg_graph(dataset, directed): + dataset.unload() + dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=directed)) + + return dg + + +# ============================================================================= +# Tests +# ============================================================================= + + @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") -@pytest.mark.parametrize( - "graph_file", - utils.DATASETS_UNRENUMBERED, - ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNRENUMBERED], -) -def test_mg_renumber(graph_file, dask_client): - - M = utils.read_csv_for_nx(graph_file) +@pytest.mark.parametrize("dataset", DATASETS_UNRENUMBERED) +def test_mg_renumber(dataset, dask_client): + M = utils.read_csv_for_nx(dataset.get_path()) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) @@ -96,13 +124,9 @@ def test_mg_renumber(graph_file, dask_client): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") -@pytest.mark.parametrize( - "graph_file", - utils.DATASETS_UNRENUMBERED, - ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNRENUMBERED], -) -def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client): - M = utils.read_csv_for_nx(graph_file) +@pytest.mark.parametrize("dataset", DATASETS_UNRENUMBERED) +def test_mg_renumber_add_internal_vertex_id(dataset, dask_client): + M = utils.read_csv_for_nx(dataset.get_path()) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) @@ -131,33 +155,13 @@ def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_mg_pagerank(dask_client, directed): +def test_dask_mg_pagerank(dask_client, dataset, directed): pandas.set_option("display.max_rows", 10000) - input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() - chunksize = dcg.get_chunksize(input_data_path) - - ddf = dask_cudf.read_csv( - input_data_path, - blocksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - df = cudf.read_csv( - input_data_path, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - g = cugraph.Graph(directed=directed) - g.from_cudf_edgelist(df, "src", "dst") - - dg = cugraph.Graph(directed=directed) - dg.from_dask_cudf_edgelist(ddf, "src", "dst") + g = get_sg_graph(dataset, directed) + dg = get_mg_graph(dataset, directed) expected_pr = cugraph.pagerank(g) result_pr = dcg.pagerank(dg).compute() @@ -178,20 +182,18 @@ def test_dask_mg_pagerank(dask_client, directed): print("Mismatches:", err) assert err == 0 + dataset.unload() + @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") -@pytest.mark.parametrize( - "graph_file", - utils.DATASETS_UNRENUMBERED, - ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNRENUMBERED], -) -def 
test_mg_renumber_common_col_names(graph_file, dask_client): +@pytest.mark.parametrize("dataset", DATASETS_UNRENUMBERED) +def test_mg_renumber_common_col_names(dataset, dask_client): """ Ensure that commonly-used column names in the input do not conflict with names used internally by NumberMap. """ - M = utils.read_csv_for_nx(graph_file) + M = utils.read_csv_for_nx(dataset.get_path()) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) diff --git a/python/cugraph/cugraph/tests/internals/test_replicate_edgelist_mg.py b/python/cugraph/cugraph/tests/internals/test_replicate_edgelist_mg.py index 3bdb5c079ef..09936e954e8 100644 --- a/python/cugraph/cugraph/tests/internals/test_replicate_edgelist_mg.py +++ b/python/cugraph/cugraph/tests/internals/test_replicate_edgelist_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,73 +17,54 @@ import dask_cudf import numpy as np -from cugraph.testing import UNDIRECTED_DATASETS, karate_disjoint - +from cugraph.datasets import karate, dolphins, karate_disjoint from cugraph.structure.replicate_edgelist import replicate_edgelist from cudf.testing.testing import assert_frame_equal -from pylibcugraph.testing.utils import gen_fixture_params_product # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= + + def setup_function(): gc.collect() +# ============================================================================= +# Parameters +# ============================================================================= + + edgeWeightCol = "weights" edgeIdCol = "edge_id" edgeTypeCol = "edge_type" srcCol = "src" dstCol = "dst" - -input_data = UNDIRECTED_DATASETS + [karate_disjoint] -datasets = [pytest.param(d) for d in input_data] - -fixture_params = gen_fixture_params_product( - (datasets, "graph_file"), - ([True, False], "distributed"), - ([True, False], "use_weights"), - ([True, False], "use_edge_ids"), - ([True, False], "use_edge_type_ids"), -) - - -@pytest.fixture(scope="module", params=fixture_params) -def input_combo(request): - """ - Simply return the current combination of params as a dictionary for use in - tests or other parameterized fixtures. 
- """ - return dict( - zip( - ( - "graph_file", - "use_weights", - "use_edge_ids", - "use_edge_type_ids", - "distributed", - ), - request.param, - ) - ) +DATASETS = [karate, dolphins, karate_disjoint] +IS_DISTRIBUTED = [True, False] +USE_WEIGHTS = [True, False] +USE_EDGE_IDS = [True, False] +USE_EDGE_TYPE_IDS = [True, False] # ============================================================================= # Tests # ============================================================================= -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) -@pytest.mark.mg -def test_mg_replicate_edgelist(dask_client, input_combo): - df = input_combo["graph_file"].get_edgelist() - distributed = input_combo["distributed"] - use_weights = input_combo["use_weights"] - use_edge_ids = input_combo["use_edge_ids"] - use_edge_type_ids = input_combo["use_edge_type_ids"] + +@pytest.mark.mg +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("distributed", IS_DISTRIBUTED) +@pytest.mark.parametrize("use_weights", USE_WEIGHTS) +@pytest.mark.parametrize("use_edge_ids", USE_EDGE_IDS) +@pytest.mark.parametrize("use_edge_type_ids", USE_EDGE_TYPE_IDS) +def test_mg_replicate_edgelist( + dask_client, dataset, distributed, use_weights, use_edge_ids, use_edge_type_ids +): + dataset.unload() + df = dataset.get_edgelist() columns = [srcCol, dstCol] weight = None diff --git a/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py b/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py index 05cc06e6282..913443fe400 100644 --- a/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py +++ b/python/cugraph/cugraph/tests/internals/test_symmetrize_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,6 +25,8 @@ # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= + + def setup_function(): gc.collect() diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index ebbd6a7eaad..675a6cf8fde 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -59,5 +59,14 @@ python_functions = filterwarnings = error:::cudf + error::FutureWarning + error::DeprecationWarning + # TODO + ignore:Multi is deprecated and the removal of multi edges will no longer be supported:FutureWarning + ignore:The legacy column names:FutureWarning + ignore:The include_hop_column flag is deprecated and will be removed:FutureWarning + ignore:Calling uniform_neighbor_sample with the:FutureWarning + ignore:The with_edge_properties flag is deprecated and will be removed:FutureWarning + ignore:This function is deprecated. Batched support for multiple vertices:DeprecationWarning # Called via dask. Not obviously addressable in cugraph. 
- ignore:The behavior of array concatenation with empty entries is deprecated:FutureWarning:cudf + ignore:The behavior of array concatenation with empty entries is deprecated:FutureWarning diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 75b5c1c5aa9..27825585c28 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -216,6 +216,8 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ wheel_graph community └─ caveman_graph +ego + └─ ego_graph small ├─ bull_graph ├─ chvatal_graph diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index edc96983b8f..f57b90eb402 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -77,6 +77,7 @@ "diamond_graph", "dodecahedral_graph", "edge_betweenness_centrality", + "ego_graph", "eigenvector_centrality", "empty_graph", "florentine_families_graph", @@ -163,6 +164,7 @@ "clustering": "Directed graphs and `weight` parameter are not yet supported.", "core_number": "Directed graphs are not yet supported.", "edge_betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.", + "ego_graph": "Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights.", "eigenvector_centrality": "`nstart` parameter is not used, but it is checked for validity.", "from_pandas_edgelist": "cudf.DataFrame inputs also supported; value columns with str is unsuppported.", "generic_bfs_edges": "`neighbors` and `sort_neighbors` parameters are not yet supported.", @@ -191,6 +193,9 @@ "bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "ego_graph": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, "eigenvector_centrality": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. 
If None, then dtype is determined by the edge values.", }, diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index d220cb18df3..c4422ffb97d 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -26,7 +26,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.16 + rev: v0.17 hooks: - id: validate-pyproject name: Validate pyproject.toml @@ -50,7 +50,7 @@ repos: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.2 + rev: v0.4.4 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -62,7 +62,7 @@ repos: additional_dependencies: &flake8_dependencies # These versions need updated manually - flake8==7.0.0 - - flake8-bugbear==24.4.21 + - flake8-bugbear==24.4.26 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.2 + rev: v0.4.4 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index f265540a161..b34245d5031 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -562,7 +562,12 @@ def to_networkx(G: nxcg.Graph, *, sort_edges: bool = False) -> nx.Graph: dst_iter = map(id_to_key.__getitem__, dst_indices) if G.is_multigraph() and (G.edge_keys is not None or G.edge_indices is not None): if G.edge_keys is not None: - edge_keys = G.edge_keys + if not G.is_directed(): + edge_keys = [k for k, m in zip(G.edge_keys, mask.tolist()) if m] + else: + edge_keys = G.edge_keys + elif not G.is_directed(): + edge_keys = G.edge_indices[mask].tolist() else: edge_keys = G.edge_indices.tolist() if edge_values: diff --git a/python/nx-cugraph/nx_cugraph/generators/__init__.py b/python/nx-cugraph/nx_cugraph/generators/__init__.py index c1834a4dec7..60a9d92373a 100644 --- a/python/nx-cugraph/nx_cugraph/generators/__init__.py +++ b/python/nx-cugraph/nx_cugraph/generators/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,5 +12,6 @@ # limitations under the License. from .classic import * from .community import * +from .ego import * from .small import * from .social import * diff --git a/python/nx-cugraph/nx_cugraph/generators/ego.py b/python/nx-cugraph/nx_cugraph/generators/ego.py new file mode 100644 index 00000000000..66c9c8b95ee --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/generators/ego.py @@ -0,0 +1,161 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import cupy as cp +import networkx as nx +import numpy as np +import pylibcugraph as plc + +import nx_cugraph as nxcg + +from ..utils import _dtype_param, _get_float_dtype, index_dtype, networkx_algorithm + +__all__ = ["ego_graph"] + + +@networkx_algorithm( + extra_params=_dtype_param, version_added="24.06", _plc={"bfs", "ego_graph", "sssp"} +) +def ego_graph( + G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None +): + """Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights.""" # noqa: E501 + if isinstance(G, nx.Graph): + G = nxcg.from_networkx(G, preserve_all_attrs=True) + if n not in G: + if distance is None: + raise nx.NodeNotFound(f"Source {n} is not in G") + raise nx.NodeNotFound(f"Node {n} not found in graph") + src_index = n if G.key_to_id is None else G.key_to_id[n] + symmetrize = "union" if undirected and G.is_directed() else None + if distance is None or distance not in G.edge_values: + # Simple BFS to determine nodes + if radius is not None and radius <= 0: + if center: + node_ids = cp.array([src_index], dtype=index_dtype) + else: + node_ids = cp.empty(0, dtype=index_dtype) + node_mask = None + else: + if radius is None or np.isinf(radius): + radius = -1 + else: + radius = math.ceil(radius) + distances, unused_predecessors, node_ids = plc.bfs( + handle=plc.ResourceHandle(), + graph=G._get_plc_graph(symmetrize=symmetrize), + sources=cp.array([src_index], index_dtype), + direction_optimizing=False, # True for undirected only; what's best? 
+ depth_limit=radius, + compute_predecessors=False, + do_expensive_check=False, + ) + node_mask = distances != np.iinfo(distances.dtype).max + else: + # SSSP to determine nodes + if callable(distance): + raise NotImplementedError("callable `distance` argument is not supported") + if symmetrize and G.is_multigraph(): + # G._get_plc_graph does not implement `symmetrize=True` w/ edge array + raise NotImplementedError( + "Weighted ego_graph with undirected=True not implemented" + ) + # Check for negative values since we don't support negative cycles + edge_vals = G.edge_values[distance] + if distance in G.edge_masks: + edge_vals = edge_vals[G.edge_masks[distance]] + if (edge_vals < 0).any(): + raise NotImplementedError( + "Negative edge weights not yet supported by ego_graph" + ) + # PERF: we could use BFS if all edges are equal + if radius is None: + radius = np.inf + dtype = _get_float_dtype(dtype, graph=G, weight=distance) + node_ids, distances, unused_predecessors = plc.sssp( + resource_handle=plc.ResourceHandle(), + graph=(G.to_undirected() if symmetrize else G)._get_plc_graph( + distance, 1, dtype + ), + source=src_index, + cutoff=np.nextafter(radius, np.inf, dtype=np.float64), + compute_predecessors=True, # TODO: False is not yet supported + do_expensive_check=False, + ) + node_mask = distances != np.finfo(distances.dtype).max + + if node_mask is not None: + if not center: + node_mask &= node_ids != src_index + node_ids = node_ids[node_mask] + if node_ids.size == G._N: + return G.copy() + # TODO: create renumbering helper function(s) + node_ids.sort() # TODO: is this ever necessary? Keep for safety + node_values = {key: val[node_ids] for key, val in G.node_values.items()} + node_masks = {key: val[node_ids] for key, val in G.node_masks.items()} + + G._sort_edge_indices() # TODO: is this ever necessary? 
Keep for safety + edge_mask = cp.isin(G.src_indices, node_ids) & cp.isin(G.dst_indices, node_ids) + src_indices = cp.searchsorted(node_ids, G.src_indices[edge_mask]).astype( + index_dtype + ) + dst_indices = cp.searchsorted(node_ids, G.dst_indices[edge_mask]).astype( + index_dtype + ) + edge_values = {key: val[edge_mask] for key, val in G.edge_values.items()} + edge_masks = {key: val[edge_mask] for key, val in G.edge_masks.items()} + + # Renumber nodes + if (id_to_key := G.id_to_key) is not None: + key_to_id = { + id_to_key[old_index]: new_index + for new_index, old_index in enumerate(node_ids.tolist()) + } + else: + key_to_id = { + old_index: new_index + for new_index, old_index in enumerate(node_ids.tolist()) + } + kwargs = { + "N": node_ids.size, + "src_indices": src_indices, + "dst_indices": dst_indices, + "edge_values": edge_values, + "edge_masks": edge_masks, + "node_values": node_values, + "node_masks": node_masks, + "key_to_id": key_to_id, + } + if G.is_multigraph(): + if G.edge_keys is not None: + kwargs["edge_keys"] = [ + x for x, m in zip(G.edge_keys, edge_mask.tolist()) if m + ] + if G.edge_indices is not None: + kwargs["edge_indices"] = G.edge_indices[edge_mask] + rv = G.__class__.from_coo(**kwargs) + rv.graph.update(G.graph) + return rv + + +@ego_graph._can_run +def _(G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None): + if distance is not None and undirected and G.is_directed() and G.is_multigraph(): + return "Weighted ego_graph with undirected=True not implemented" + if distance is not None and nx.is_negatively_weighted(G, weight=distance): + return "Weighted ego_graph with negative cycles not yet supported" + if callable(distance): + return "callable `distance` argument is not supported" + return True diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py new file mode 100644 index 00000000000..5474f9d79e3 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import networkx as nx +import pytest +from packaging.version import parse + +import nx_cugraph as nxcg + +from .testing_utils import assert_graphs_equal + +nxver = parse(nx.__version__) + + +if nxver.major == 3 and nxver.minor < 2: + pytest.skip("Need NetworkX >=3.2 to test ego_graph", allow_module_level=True) + + +@pytest.mark.parametrize( + "create_using", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] +) +@pytest.mark.parametrize("radius", [-1, 0, 1, 1.5, 2, float("inf"), None]) +@pytest.mark.parametrize("center", [True, False]) +@pytest.mark.parametrize("undirected", [False, True]) +@pytest.mark.parametrize("multiple_edges", [False, True]) +@pytest.mark.parametrize("n", [0, 3]) +def test_ego_graph_cycle_graph( + create_using, radius, center, undirected, multiple_edges, n +): + Gnx = nx.cycle_graph(7, create_using=create_using) + if multiple_edges: + # Test multigraph with multiple edges + if not Gnx.is_multigraph(): + return + Gnx.add_edges_from(nx.cycle_graph(7, create_using=nx.DiGraph).edges) + Gnx.add_edge(0, 1, 10) + Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True) + assert_graphs_equal(Gnx, Gcg) # Sanity check + + kwargs = {"radius": radius, "center": center, "undirected": undirected} + Hnx = nx.ego_graph(Gnx, n, **kwargs) + Hcg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") + assert_graphs_equal(Hnx, Hcg) + with pytest.raises(nx.NodeNotFound, match="not in G"): + nx.ego_graph(Gnx, -1, **kwargs) + with pytest.raises(nx.NodeNotFound, match="not in G"): + nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph") + # Using sssp with default weight of 1 should give same answer as bfs + nx.set_edge_attributes(Gnx, 1, name="weight") + Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True) + assert_graphs_equal(Gnx, Gcg) # Sanity check + + kwargs["distance"] = "weight" + H2nx = nx.ego_graph(Gnx, n, **kwargs) + is_nx32 = nxver.major == 3 and nxver.minor == 2 + if undirected and Gnx.is_directed() and Gnx.is_multigraph(): + if is_nx32: + # `should_run` was added in nx 3.3 + match = "Weighted ego_graph with undirected=True not implemented" + else: + match = "not implemented by cugraph" + with pytest.raises(RuntimeError, match=match): + nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") + with pytest.raises(NotImplementedError, match="ego_graph"): + nx.ego_graph(Gcg, n, **kwargs) + else: + H2cg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") + assert_graphs_equal(H2nx, H2cg) + with pytest.raises(nx.NodeNotFound, match="not found in graph"): + nx.ego_graph(Gnx, -1, **kwargs) + with pytest.raises(nx.NodeNotFound, match="not found in graph"): + nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph") diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index a7daf01775b..477fe8bb493 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
 [build-system]
 
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
@@ -233,6 +233,7 @@ ignore = [
 "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"]
 "_nx_cugraph/__init__.py" = ["E501"]
 "nx_cugraph/algorithms/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for algorithms
+"nx_cugraph/generators/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for generators
 "nx_cugraph/interface.py" = ["D401"]  # Flexible docstrings
 "scripts/update_readme.py" = ["INP001"]  # Not part of a package
 
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
index 6f1ac1f640b..315c9bd7503 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
@@ -183,6 +183,17 @@ cdef extern from "cugraph_c/graph_functions.h":
             cugraph_error_t** error
         )
 
+    ###########################################################################
+    # count multi-edges
+    cdef cugraph_error_code_t \
+        cugraph_count_multi_edges(
+            const cugraph_resource_handle_t *handle,
+            cugraph_graph_t* graph,
+            bool_t do_expensive_check,
+            size_t *result,
+            cugraph_error_t** error
+        )
+
     ###########################################################################
     # degrees
     ctypedef struct cugraph_degrees_result_t:
diff --git a/python/pylibcugraph/pylibcugraph/count_multi_edges.pyx b/python/pylibcugraph/pylibcugraph/count_multi_edges.pyx
new file mode 100644
index 00000000000..d3780e53283
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/count_multi_edges.pyx
@@ -0,0 +1,99 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    data_type_id_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_count_multi_edges,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+)
+
+
+def count_multi_edges(ResourceHandle resource_handle,
+                      _GPUGraph graph,
+                      bool_t do_expensive_check):
+    """
+    Count the number of multi-edges in the graph. This returns
+    the number of duplicates. If the edge (u, v) appears k times
+    in the graph, then that edge will contribute (k-1) toward the
+    total number of duplicates.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+ + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + do_expensive_check : bool_t + A flag to run expensive checks for input arguments if True. + + Returns + ------- + Total count of duplicate edges in the graph + + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 0, 0], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 1, 1], dtype=numpy.int32) + >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32) + >>> resource_handle = pylibcugraph.ResourceHandle() + >>> graph_props = pylibcugraph.GraphProperties( + ... is_symmetric=False, is_multigraph=False) + >>> G = pylibcugraph.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weight_array=weights, + ... store_transposed=True, renumber=False, do_expensive_check=False) + >>> count = pylibcugraph.count_multi_edges(resource_handle, G, False) + + """ + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef size_t result + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + error_code = cugraph_count_multi_edges(c_resource_handle_ptr, + c_graph_ptr, + do_expensive_check, + &result, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_count_multi_edges") + + return result;
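The `count_multi_edges` wrapper above already carries its own doctest-style example. For the nx-cugraph `ego_graph` support added earlier in this diff, usage goes through normal NetworkX dispatch, mirroring the new test module. A small sketch, assuming NetworkX >= 3.2 (which the new tests require) and that this branch's nx-cugraph is installed:

```python
import networkx as nx

G = nx.cycle_graph(7)
nx.set_edge_attributes(G, 1, name="weight")

# BFS-based ego graph: nodes within `radius` hops of node 0.
H_bfs = nx.ego_graph(G, 0, radius=2, backend="cugraph")

# SSSP-based ego graph when `distance` names an edge attribute; with unit
# weights this selects the same node set as the BFS form.
H_sssp = nx.ego_graph(G, 0, radius=2, distance="weight", backend="cugraph")

assert set(H_bfs) == set(H_sssp)
```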